scirpy.datasets.wu2020

Contents

scirpy.datasets.wu2020#

scirpy.datasets.wu2020()#

Return the dataset from [WMdA+20] as MuData object.

140k cells, of which 100k have TCRs. :rtype: MuData

Note

Scirpy example datasets are managed through Pooch.

By default, the dataset will be downloaded into your operating system’s default cache directory (See pooch.os_cache() for more details). If it has already been downloaded, it will be retrieved from the cache.

You can override the default cache dir by setting the SCIRPY_DATA_DIR environment variable to a path of your preference.

This is how the dataset was processed:

    # ---
# jupyter:
#   jupytext:
#     cell_metadata_filter: -all
#     formats: py:light,ipynb
#     notebook_metadata_filter: -kernelspec
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.14.4
# ---

# +
# %load_ext autoreload
# %autoreload 2
import sys

import scanpy as sc

# +
sys.path.insert(0, "../../..")
import os
from glob import glob
from multiprocessing import Pool

import anndata
import numpy as np
import pandas as pd
from mudata import MuData

import scirpy as ir

# + language="bash"
# mkdir -p data
# cd data
# wget --no-verbose -O GSE139555_raw.tar "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE139555&format=file"
# wget --no-verbose -O GSE139555_tcell_metadata.txt.gz "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE139555&format=file&file=GSE139555%5Ftcell%5Fmetadata%2Etxt%2Egz"
# tar xvf GSE139555_raw.tar

# + language="bash"
# cd data
# for f in *.matrix.mtx.gz; do
#   dirname=${f/\.matrix\.mtx\.gz/}
#   mkdir $dirname
#   mv $dirname.genes.tsv.gz $dirname/genes.tsv.gz
#   mv $dirname.matrix.mtx.gz $dirname/matrix.mtx.gz
#   mv $dirname.barcodes.tsv.gz $dirname/barcodes.tsv.gz
#   mv $dirname.filtered_contig_annotations.csv.gz $dirname/filtered_contig_annotations.csv.gz
#   # fix missing feature type column
#   zcat $dirname/genes.tsv.gz |  awk '{print $0 "\tGene Expression"}' | gzip > $dirname/features.tsv.gz
# done
# -

mtx_paths = glob("data/GSM*")

mtx_paths

metadata_all = pd.read_csv("data/GSE139555_tcell_metadata.txt.gz", sep="\t", index_col=0)

umap = metadata_all[["UMAP_1", "UMAP_2"]]

metadata = metadata_all[["ident", "patient", "sample", "source", "clonotype"]]

metadata = metadata.rename(columns={"clonotype": "clonotype_orig", "ident": "cluster_orig"})

metadata


def _load_adata(path):
    sample_id = path.split("-")[-1].upper()
    obs = metadata.loc[metadata["sample"] == sample_id, :]
    umap_coords = umap.loc[metadata["sample"] == sample_id, :].values
    adata = sc.read_10x_mtx(path)
    adata_tcr = ir.io.read_10x_vdj(os.path.join(path, "filtered_contig_annotations.csv.gz"))
    adata.obs_names = [f"{sample_id}_{barcode}" for barcode in adata.obs_names]
    adata_tcr.obs_names = [f"{sample_id}_{barcode}" for barcode in adata_tcr.obs_names]
    # subset to cells with annotated metadata only
    adata = adata[obs.index, :].copy()
    # all metadata except clonotyp_orig in GEX modality
    adata.obs = adata.obs.join(obs.drop(columns=["clonotype_orig"]), how="inner")
    assert adata.shape[0] == umap_coords.shape[0]
    adata.obsm["X_umap_orig"] = umap_coords
    # #356: workaround for https://github.com/scverse/muon/issues/93
    adata_tcr.X = np.ones((adata_tcr.shape[0], 0))
    # clonotype orig column in TCR modality
    adata_tcr.obs = adata_tcr.obs.join(obs.loc[:, ["clonotype_orig"]], how="left", validate="one_to_one")
    return adata, adata_tcr


p = Pool()
adatas = p.map(_load_adata, mtx_paths)
p.close()

adatas, adatas_airr = zip(*adatas)

adata = anndata.concat(adatas)

adata_airr = anndata.concat(adatas_airr)

# inverse umap X -coordinate
adata.obsm["X_umap_orig"][:, 0] = np.max(adata.obsm["X_umap_orig"][:, 0]) - adata.obsm["X_umap_orig"][:, 0]

mdata = MuData({"gex": adata, "airr": adata_airr})

mdata

adata.obs

adata_airr.obs

mdata.obs

sc.pl.embedding(adata, "umap_orig", color="cluster_orig", legend_loc="on data")

mdata.write_h5mu("wu2020.h5mu", compression="lzf")