Skip to content

better error messaging for concat #501

@gheimberg

Description

@gheimberg

Hi,
I recently got stuck using the anndata `concat` function, and it took me a while to realize that AnnData objects with duplicated gene names cause an error. I think it would be an easy fix to check for duplicated gene names up front and raise a clearer error message.

Thanks!

import anndata as an
import pandas as pd
import numpy as np
x = np.zeros((3,3))
obs = pd.DataFrame(index=['cell1', 'cell2', 'cell3'])
var = pd.DataFrame(index=['gene1', 'gene2', 'gene2'])
test = an.AnnData(X=x, obs=obs, var=var)
an.concat((test, test))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-27-68727e4bbc1e> in <module>
      4 var = pd.DataFrame(index=['gene1', 'gene2', 'gene2'])
      5 test = an.AnnData(X=x, obs=obs, var=var)
----> 6 an.concat((test, test))

~/miniconda3/envs/dev/lib/python3.8/site-packages/anndata/_core/merge.py in concat(adatas, axis, join, merge, uns_merge, label, keys, index_unique, fill_value, pairwise)
    812 
    813     # Annotation for other axis
--> 814     alt_annot = merge_dataframes(
    815         [getattr(a, alt_dim) for a in adatas], alt_indices, merge
    816     )

~/miniconda3/envs/dev/lib/python3.8/site-packages/anndata/_core/merge.py in merge_dataframes(dfs, new_index, merge_strategy)
    524 
    525 def merge_dataframes(dfs, new_index, merge_strategy=merge_unique):
--> 526     dfs = [df.reindex(index=new_index) for df in dfs]
    527     # New dataframe with all shared data
    528     new_df = pd.DataFrame(merge_strategy(dfs), index=new_index)

~/miniconda3/envs/dev/lib/python3.8/site-packages/anndata/_core/merge.py in <listcomp>(.0)
    524 
    525 def merge_dataframes(dfs, new_index, merge_strategy=merge_unique):
--> 526     dfs = [df.reindex(index=new_index) for df in dfs]
    527     # New dataframe with all shared data
    528     new_df = pd.DataFrame(merge_strategy(dfs), index=new_index)

~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
    310         @wraps(func)
    311         def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 312             return func(*args, **kwargs)
    313 
    314         kind = inspect.Parameter.POSITIONAL_OR_KEYWORD

~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
   4171         kwargs.pop("axis", None)
   4172         kwargs.pop("labels", None)
-> 4173         return super().reindex(**kwargs)
   4174 
   4175     def drop(

~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
   4804 
   4805         # perform the reindex on the axes
-> 4806         return self._reindex_axes(
   4807             axes, level, limit, tolerance, method, fill_value, copy
   4808         ).__finalize__(self, method="reindex")

~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
   4017         index = axes["index"]
   4018         if index is not None:
-> 4019             frame = frame._reindex_index(
   4020                 index, method, copy, level, fill_value, limit, tolerance
   4021             )

~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
   4036             new_index, method=method, level=level, limit=limit, tolerance=tolerance
   4037         )
-> 4038         return self._reindex_with_indexers(
   4039             {0: [new_index, indexer]},
   4040             copy=copy,

~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
   4870 
   4871             # TODO: speed up on homogeneous DataFrame objects
-> 4872             new_data = new_data.reindex_indexer(
   4873                 index,
   4874                 indexer,

~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/internals/managers.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy, consolidate, only_slice)
   1299         # some axes don't allow reindexing with dups
   1300         if not allow_dups:
-> 1301             self.axes[axis]._can_reindex(indexer)
   1302 
   1303         if axis >= self.ndim:

~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/indexes/base.py in _can_reindex(self, indexer)
   3474         # trying to reindex on an axis with duplicates
   3475         if not self._index_as_unique and len(indexer):
-> 3476             raise ValueError("cannot reindex from a duplicate axis")
   3477 
   3478     def reindex(self, target, method=None, level=None, limit=None, tolerance=None):

ValueError: cannot reindex from a duplicate axis

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions