-
Notifications
You must be signed in to change notification settings - Fork 171
Open
Description
Hi,
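I recently got stuck using the `anndata.concat` function, and it took me a while to realize that AnnData objects with duplicated gene names (i.e. a non-unique `var` index) cause an error. I think it would be an easy fix to check for duplicated gene names up front and raise a clearer error message, instead of the pandas "cannot reindex from a duplicate axis" error shown below.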
Thanks!
import anndata as an
import pandas as pd
import numpy as np
x = np.zeros((3,3))
obs = pd.DataFrame(index=['cell1', 'cell2', 'cell3'])
var = pd.DataFrame(index=['gene1', 'gene2', 'gene2'])
test = an.AnnData(X=x, obs=obs, var=var)
an.concat((test, test))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-27-68727e4bbc1e> in <module>
4 var = pd.DataFrame(index=['gene1', 'gene2', 'gene2'])
5 test = an.AnnData(X=x, obs=obs, var=var)
----> 6 an.concat((test, test))
~/miniconda3/envs/dev/lib/python3.8/site-packages/anndata/_core/merge.py in concat(adatas, axis, join, merge, uns_merge, label, keys, index_unique, fill_value, pairwise)
812
813 # Annotation for other axis
--> 814 alt_annot = merge_dataframes(
815 [getattr(a, alt_dim) for a in adatas], alt_indices, merge
816 )
~/miniconda3/envs/dev/lib/python3.8/site-packages/anndata/_core/merge.py in merge_dataframes(dfs, new_index, merge_strategy)
524
525 def merge_dataframes(dfs, new_index, merge_strategy=merge_unique):
--> 526 dfs = [df.reindex(index=new_index) for df in dfs]
527 # New dataframe with all shared data
528 new_df = pd.DataFrame(merge_strategy(dfs), index=new_index)
~/miniconda3/envs/dev/lib/python3.8/site-packages/anndata/_core/merge.py in <listcomp>(.0)
524
525 def merge_dataframes(dfs, new_index, merge_strategy=merge_unique):
--> 526 dfs = [df.reindex(index=new_index) for df in dfs]
527 # New dataframe with all shared data
528 new_df = pd.DataFrame(merge_strategy(dfs), index=new_index)
~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
310 @wraps(func)
311 def wrapper(*args, **kwargs) -> Callable[..., Any]:
--> 312 return func(*args, **kwargs)
313
314 kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/frame.py in reindex(self, *args, **kwargs)
4171 kwargs.pop("axis", None)
4172 kwargs.pop("labels", None)
-> 4173 return super().reindex(**kwargs)
4174
4175 def drop(
~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/generic.py in reindex(self, *args, **kwargs)
4804
4805 # perform the reindex on the axes
-> 4806 return self._reindex_axes(
4807 axes, level, limit, tolerance, method, fill_value, copy
4808 ).__finalize__(self, method="reindex")
~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/frame.py in _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy)
4017 index = axes["index"]
4018 if index is not None:
-> 4019 frame = frame._reindex_index(
4020 index, method, copy, level, fill_value, limit, tolerance
4021 )
~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/frame.py in _reindex_index(self, new_index, method, copy, level, fill_value, limit, tolerance)
4036 new_index, method=method, level=level, limit=limit, tolerance=tolerance
4037 )
-> 4038 return self._reindex_with_indexers(
4039 {0: [new_index, indexer]},
4040 copy=copy,
~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/generic.py in _reindex_with_indexers(self, reindexers, fill_value, copy, allow_dups)
4870
4871 # TODO: speed up on homogeneous DataFrame objects
-> 4872 new_data = new_data.reindex_indexer(
4873 index,
4874 indexer,
~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/internals/managers.py in reindex_indexer(self, new_axis, indexer, axis, fill_value, allow_dups, copy, consolidate, only_slice)
1299 # some axes don't allow reindexing with dups
1300 if not allow_dups:
-> 1301 self.axes[axis]._can_reindex(indexer)
1302
1303 if axis >= self.ndim:
~/miniconda3/envs/dev/lib/python3.8/site-packages/pandas/core/indexes/base.py in _can_reindex(self, indexer)
3474 # trying to reindex on an axis with duplicates
3475 if not self._index_as_unique and len(indexer):
-> 3476 raise ValueError("cannot reindex from a duplicate axis")
3477
3478 def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
ValueError: cannot reindex from a duplicate axis
gokceneraslan