00 - Setup#

import os
import stampede as st
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

from pydeseq2.ds import DeseqStats
from pydeseq2.dds import DeseqDataSet

# dictionary with input files per slide
slides = {
    1: {
        "exprmat": "exprMat_file.csv.gz",
        "metadata": "metadata_file.csv.gz",
        "fov_positions": "fov_positions_file.csv.gz",
    }
}
# optional filepath prefix
data_dir = "data"

os.makedirs(st.config["adata_dir"], exist_ok=True)

01 - Read#

# table mapping the FOVs per slide to each sample
# column "fovs" accepts any combination of comma separated values (e.g. 1,2,3) and ranges (e.g. 4-6)
samples_df = pd.read_table("data/sample2fov.csv", sep=",")
st.validate_input(slides, samples_df, data_dir)
samples_df

	sample	fovs	slide
0	s1	10-18	1
1	s2	19-27	1
2	s3	28-36	1

# combine the input files into a single cell object and write it to file
adata_file = os.path.join(st.config["adata_dir"], "raw_data.h5ad")
st.read_cosmx(
    slides,
    samples_df,
    adata_file,
    data_dir=data_dir,
    overwrite=False,
    verbose=True,
)

adata_file already exists and overwrite=False

'adatas/raw_data.h5ad'

adata = sc.read_h5ad(adata_file)
adata.obs

	fov	cell_ID	slide	slide-fov	Area	AspectRatio	CenterX_local_px	CenterY_local_px	Width	Height	...	nCount	nCountPerCell	nFeaturePerCell	propNegativeCellAvg	complexityCellAvg	errorCtPerCellEstimate	percOfDataFromErrorPerCell	qcFlagsFOV	sample	fovs
slide-fov-cell_ID
1-10-1	10	1	1	1-10	4260	0.97	2902	613	77	75	...	140803	863.822086	670.730061	0.000315	1.245668	87.131902	0.100868	Pass	s1	10-18
1-10-2	10	2	1	1-10	5228	0.95	3007	623	83	87	...	140803	863.822086	670.730061	0.000315	1.245668	87.131902	0.100868	Pass	s1	10-18
1-10-3	10	3	1	1-10	3204	0.89	2947	649	71	63	...	140803	863.822086	670.730061	0.000315	1.245668	87.131902	0.100868	Pass	s1	10-18
1-10-4	10	4	1	1-10	22940	0.81	2833	746	191	237	...	140803	863.822086	670.730061	0.000315	1.245668	87.131902	0.100868	Pass	s1	10-18
1-10-5	10	5	1	1-10	4404	0.76	2990	691	93	71	...	140803	863.822086	670.730061	0.000315	1.245668	87.131902	0.100868	Pass	s1	10-18
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1-36-271	36	271	1	1-36	4264	0.88	1812	2281	81	71	...	110510	401.854545	338.160000	0.000438	1.168196	46.031818	0.114548	Pass	s3	28-36
1-36-272	36	272	1	1-36	5024	0.79	397	3328	87	69	...	110510	401.854545	338.160000	0.000438	1.168196	46.031818	0.114548	Pass	s3	28-36
1-36-273	36	273	1	1-36	5460	0.98	323	3370	83	81	...	110510	401.854545	338.160000	0.000438	1.168196	46.031818	0.114548	Pass	s3	28-36
1-36-274	36	274	1	1-36	1888	0.96	455	3372	47	49	...	110510	401.854545	338.160000	0.000438	1.168196	46.031818	0.114548	Pass	s3	28-36
1-36-275	36	275	1	1-36	3880	0.97	398	3397	69	67	...	110510	401.854545	338.160000	0.000438	1.168196	46.031818	0.114548	Pass	s3	28-36

27625 rows × 79 columns

02 - QC#

02a - Slide QC#

fig, ax = st.pl.ncell_per_condition(adata, "sample")
plt.show()

../_images/a26202d31d9503c67505b6848bbd5eae843426d019634290cae69045af847aac.png

st.pp.slide_qc(adata, slides, ["sample"], data_dir=data_dir)
adata.uns["fov_metadata"].head()

	slide	fov	x	y	nCounts	nCell	mean_CountsPerCell	nCount_negprobes	mean_NegProbe-CountsPerCell	nCount_falsecode	mean_FalseCode-CountsPerCell	mean_CellSize	sample
slide-fov
1-10	1	10	19773	149405	140803	163	863.822086	46	0.282209	480	2.944785	63.957736	s1
1-11	1	11	24029	149405	752088	848	886.896226	274	0.323113	2569	3.029481	54.344592	s1
1-12	1	12	28285	149405	50213	47	1068.361702	21	0.446809	151	3.212766	74.681880	s1
1-13	1	13	19773	145149	1241593	1866	665.376742	521	0.279207	5842	3.130761	52.223075	s1
1-14	1	14	24029	145149	2638265	3391	778.019758	916	0.270127	10325	3.044825	52.564838	s1

# this dataset was subset from a much larger slide
# the plot limits in this tutorial are extended to provide a sense of scale
fig, axs = st.pl.slide_qc(
    adata, columns=["sample", "nCell", "mean_CountsPerCell"], figsize=(2.5, 0.5), legend_kwargs={"ncols": 3}
)
ylim = axs[1, 0].get_ylim()
axs[1, 0].set_ylim(ylim[0]-10_000, ylim[1]+20_000)
plt.show()

fig, axs = st.pl.slide_qc(
    adata, columns=["mean_NegProbe-CountsPerCell", "mean_FalseCode-CountsPerCell", "mean_CellSize"], figsize=(2.5, 0.5)
)
ylim = axs[1, 0].get_ylim()
axs[1, 0].set_ylim(ylim[0]-10_000, ylim[1]+20_000)
plt.show()

../_images/21032582737cc9add762a797300b0ae7ef9aa2ad2ffebf07dba1888975029d2a.png

../_images/c4a8963596ea02016b77f0adacc7ba4872b962b3b221f34a95d5f23d77f09e37.png

Visualizing Field of View edge-effects#

Cells on the edges of an Field of View (FOV) may have been truncated. Furthermore, we tend to detect less RNA in them. There may also be a bias to the corners and/or one or two edges in the dataset.

# plot the average nCount_RNA per pixel over all FOVs
fig, axs = st.pl.avg_per_pixel(
    adata, column='nCount_RNA', normalize_cell_area=True
)
plt.show()

# plot the cell area for all FOVs. 
# (do not normalize for cell area when plotting the cell area)
fig, axs = st.pl.avg_per_pixel(
    adata, column='Area.um2', normalize_cell_area=False
)
plt.show()

../_images/a7adcfeaf0cb005076b60c558b96ecd6e85752128c2d8c683b3dd562898611fe.png

../_images/6e13160054ef02692720bfc64cdda20dcc5919054b603511c8375f9036d2df31.png

# visualize the effect of removing cells based on distance to FOV edge
fig, axs = st.pl.correlations(adata, xcolumn="dist2edge_px", ycolumn="nCount_RNA", max_quantile=0.7)

dist2edge_threshold = 50
print(f"{dist2edge_threshold=}")
axs[1, 1].axvline(dist2edge_threshold, color="red")
plt.show()

dist2edge_threshold=50

../_images/95bc5033debe908348ca28d0f6ea6c657dafdfd5a8869c61f7b90cbf1ac62c36.png

adata = st.pp.filter_edges(adata, dist2edge_threshold)

1_729 cells filtered out, 25_896 cell remaining.

02b - gene QC#

# remove non-PBMC genes
fname = os.path.join(data_dir, "blacklist_pbmc_6k.txt")
blacklist = pd.read_table(fname, header=None)[0].to_list()

before = len(adata.var)
adata = adata[:, ~adata.var.index.isin(blacklist)].copy()
after = len(adata.var)
print(before - after, "genes filtered out")

904 genes filtered out

# mark core immune genes
fname = os.path.join(data_dir, "core_immune_genes_6k.txt")
whitelist = pd.read_table(fname, header=None)[0].to_list()
adata.var["is_core_immune_gene"] = adata.var.index.isin(whitelist)

# determine a noise threshold for gene expression using the mean expression 
# and standard deviation of the negative control probes
st.pp.gene_qc(adata)
adata.var

	is_core_immune_gene	is_negctrl	is_sysctrl	nCell	pctCell	nTranscript	mean_Transcript	above_noise
A1BG	False	False	False	5625	21.721501	6883	0.265794	True
A2M	True	False	False	635	2.452116	658	0.025409	True
AAAS	False	False	False	1416	5.468026	1496	0.057770	True
AAK1	True	False	False	2175	8.398981	2363	0.091250	True
AAMP	False	False	False	1483	5.726753	1591	0.061438	True
...	...	...	...	...	...	...	...	...
ZSWIM6	True	False	False	854	3.297807	888	0.034291	True
ZWINT	True	False	False	969	3.741891	1019	0.039350	True
ZYG11B	False	False	False	1855	7.163268	2002	0.077309	True
ZYX	True	False	False	2650	10.233241	3012	0.116311	True
ZZZ3	True	False	False	1606	6.201730	1709	0.065995	True

5614 rows × 8 columns

fig, ax = st.pl.noise_threshold(adata, max_quantile=0.95)
plt.show()

../_images/d023f4b9e4f9bf179be7683ecd8d2f52148d300c2b6a462174d8f728b896aa20.png

adata = st.pp.filter_genes(adata)

377 genes filtered out, 5_237 genes remaining.

02c - cell QC#

fig, axs = st.pl.violin(
    adata, 
    ["nCount_RNA", "nFeature_RNA", "Area.um2", "nCount_negprobes", "nCount_falsecode"],
    log_scale=(True, True, True, False, False)
)
plt.show()

../_images/7d9054dd1bb8f7f41786d40bf1ac373d972b5c98c249f53eefa41823da777d98.png

adata = st.pp.filter_cells(
    adata, 
    falsecode_max = 5,
    negprobe_max = 2,
    ntranscript_min = 250,
    ntranscript_max = 1500,
    area_min = 25,
    area_max = 110,
)

9_123 cells filtered out, 16_773 cells remaining.

st.pp.gene_qc_postfilter(adata)
adata.var[["nCell", "nCell_postfilter", "pctCell", "pctCell_postfilter"]]

	nCell	nCell_postfilter	pctCell	pctCell_postfilter
A1BG	5625	3674	21.721501	21.90
A2M	635	417	2.452116	2.49
AAAS	1416	952	5.468026	5.68
AAK1	2175	1434	8.398981	8.55
AAMP	1483	922	5.726753	5.50
...	...	...	...	...
ZSWIM6	854	547	3.297807	3.26
ZWINT	969	605	3.741891	3.61
ZYG11B	1855	1180	7.163268	7.04
ZYX	2650	1766	10.233241	10.53
ZZZ3	1606	1067	6.201730	6.36

5237 rows × 4 columns

st.pp.cell_qc_postfilter(adata)
adata.obs[["nFeature_RNA", "nFeature_RNA_postfilter", "nCount_RNA", "nCount_RNA_postfilter"]]

	nFeature_RNA	nFeature_RNA_postfilter	nCount_RNA	nCount_RNA_postfilter
slide-fov-cell_ID
1-10-1	627	560	821	748
1-10-2	873	771	1185	1067
1-10-6	648	573	838	748
1-10-8	831	716	1015	888
1-10-9	666	583	823	730
...	...	...	...	...
1-36-269	750	643	900	786
1-36-270	725	629	863	754
1-36-272	847	718	1033	894
1-36-273	814	711	989	871
1-36-275	660	562	782	676

16773 rows × 4 columns

fig, ax = st.pl.ncell_per_condition(adata, columns = ["sample"])
plt.show()

../_images/1f03e1b495ab2e3a0edbb8c74429140f97305e64b070b45f32696dfb37abd4d3.png

03 - Dimensionality reduction#

st.pp.binarize(adata)

binary layer set as adata.X

st.pp.dim_red(adata, use_genes="is_core_immune_gene")
fig, ax = st.pl.scree(adata)
plt.show()

../_images/a5ad4eaede38027cbf538774d4a2370d0127c1cfad9404d2d4c964c5bfd5fec9.png

fig_ax_list = st.pl.dim_red(
    adata, 
    columns=["sample"],
    subset_size=1000,
)
plt.show()

../_images/96addafce410beeb94861757737551479edb82edabddf9879c41adb6489cf3c6.png

sc.pp.neighbors(
    adata,
    use_rep="X_svd",
    key_added="neighbors_svd",
)

sc.tl.umap(
    adata,
    neighbors_key="neighbors_svd",
    key_added="umap_svd",
)

sc.pl.embedding(
    adata,
    basis="umap_svd",
    color="sample",
    alpha=0.5,
)

../_images/13d6be289aa4c55a177d68fd37ae53cafa984707d5e1b953ae6fb599759372b7.png

04 - Clustering#

st.pp.knn_count_smoothing(
    adata,
    neighbors_key="neighbors_svd",
)

KNN_binary_mean layer set as adata.X

markerdict = {
    "T_cells": [
        "CD3D", "TRBC1","IL7R", "CD8B",
    ],
    "B_cells": [
        "MS4A1", "CD79A", "CD79B", "CD37"
    ],
    "NK_cells": [
        "NKG7", "GNLY", "KLRD1", "FCGR3A"
    ],
    "Monocytes_Macrophages": [
        "LYZ", "S100A8", "CTSD", "CD14",
    ],
}

for ctype, markers in markerdict.items():
    print(ctype)
    sc.pl.embedding(
        adata,
        basis="umap_svd",
        color=markers,
        layer="KNN_binary_mean",
        ncols=4,
        alpha=0.5,
    )

T_cells

../_images/de72ca37f88bc1825abb2f48fca3ad9f5cde96228a208bf54fd0c4bf4affbf7b.png

B_cells

../_images/b44db9b82f9b89e3a5f3291aae8351f169e834f289338b9457fef4b04c12d281.png

NK_cells

../_images/53e24936bba54160d3f5bdf93aefc9226d90e4345a3135e9746ca04b5cc7ac08.png

Monocytes_Macrophages

../_images/73a0111bea716aa5695c9bc22cce7a064b9002ff68d3f31588ace2a3bf73fc64.png

res = 0.5
sc.tl.leiden(
    adata,
    resolution=res,
    neighbors_key="neighbors_svd",
    key_added=f"leiden_res_{res:.2f}",
    flavor="igraph",
    n_iterations=2,
    directed=False,
    random_state=42,
)

sc.tl.rank_genes_groups(adata, f"leiden_res_{res:.2f}", n_genes=6, layer="binary")
sc.tl.dendrogram(adata, f"leiden_res_{res:.2f}", use_rep="X_svd")
sc.pl.rank_genes_groups_matrixplot(
    adata,
    n_genes=6,
)

WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.

../_images/974ea3d8f5898dbf166aab669e27540b07828abeae4c419117f5bd47c492eaa4.png

cluster2ctype = {
    "0": "T_cells",
    "1": "NK_cells",
    "2": "B_cells",
    "3": "Myeloid_cells",
}
adata.obs["celltype_lvl1"] = adata.obs[f"leiden_res_{res:.2f}"].astype(str).replace(cluster2ctype).astype("category")

sc.pl.embedding(
    adata,
    basis="umap_svd",
    color="celltype_lvl1",
    alpha=0.5,
)

../_images/47341571b8a7390a033b426c1e38f10fe3f733d22606697f956f5d99be5a4607.png

05 - Differential expression analysis#

st.pp.combine_obs_columns(adata, ["sample", "celltype_lvl1"], "sample_celltype")
adata.obs["sample_celltype"].value_counts().head()

sample_celltype
s1_T_cells          4003
s3_T_cells          3912
s2_T_cells          2483
s2_NK_cells         1359
s2_Myeloid_cells    1351
Name: count, dtype: int64

05a - using detection rates & pbGLM#

det_rate_df = st.pp.detection_rates(adata, "sample_celltype")
det_rate_df

	s1_T_cells	s3_T_cells	s2_T_cells	s2_NK_cells	s2_Myeloid_cells	s3_Myeloid_cells	s1_Myeloid_cells	s3_NK_cells	s1_NK_cells	s3_B_cells	s2_B_cells	s1_B_cells
A1BG	0.166938	0.186624	0.192226	0.180092	0.241642	0.234460	0.198680	0.152472	0.154464	0.222320	0.177528	0.168530
A2M	0.022996	0.016689	0.019951	0.024067	0.019234	0.015267	0.023463	0.016206	0.023768	0.027226	0.022711	0.013955
AAAS	0.046860	0.046575	0.053780	0.039238	0.046576	0.037077	0.039669	0.037278	0.053029	0.051782	0.071536	0.051186
AAK1	0.076153	0.075050	0.077622	0.067599	0.053272	0.056707	0.068781	0.092419	0.077461	0.051782	0.042722	0.019593
AAMP	0.044045	0.040120	0.048625	0.032338	0.047914	0.047983	0.065840	0.038899	0.047833	0.049051	0.071536	0.059978
...	...	...	...	...	...	...	...	...	...	...	...	...
ZSWIM6	0.026101	0.020126	0.022668	0.024756	0.045907	0.041439	0.040629	0.021068	0.022061	0.027226	0.017018	0.030966
ZWINT	0.032164	0.033675	0.026749	0.031648	0.018569	0.032715	0.025359	0.019447	0.032328	0.032678	0.028416	0.019593
ZYG11B	0.061227	0.052116	0.061703	0.058594	0.064679	0.061069	0.054143	0.056735	0.056500	0.057245	0.071536	0.028111
ZYX	0.064447	0.086215	0.068612	0.069678	0.139921	0.201745	0.144557	0.113513	0.068700	0.076388	0.048464	0.068847
ZZZ3	0.050390	0.053733	0.058255	0.052366	0.044569	0.050164	0.055113	0.058357	0.046104	0.068180	0.062864	0.036702

5237 rows × 12 columns

%%time
results_df1 = st.tl.paired_binomial_glm(
    det_rate_df, 
    adata, 
    column="sample_celltype", 
    test_condition="NK_cells", 
    reference_condition="Myeloid_cells", 
    condition_column="celltype_lvl1",
)
results_df1

CPU times: user 22.3 s, sys: 15.9 ms, total: 22.3 s
Wall time: 22.3 s

	beta	se	odds_ratio	pval	perfect_separation	error	padj	-log10(padj)	log2(odds_ratio)
gene
IFI30	-4.339651	0.090796	0.013041	0.000000e+00	False	None	0.000000e+00	9.0	-6.260793
TIMP1	-4.036057	0.084280	0.017667	0.000000e+00	False	None	0.000000e+00	9.0	-5.822799
LYZ	-3.551084	0.079467	0.028694	0.000000e+00	False	None	0.000000e+00	9.0	-5.123131
PSAP	-3.309378	0.073933	0.036539	0.000000e+00	False	None	0.000000e+00	9.0	-4.774423
THBS1	-3.108490	0.072474	0.044668	0.000000e+00	False	None	0.000000e+00	9.0	-4.484603
...	...	...	...	...	...	...	...	...	...
CD247	2.019324	0.078699	7.533233	3.378460e-145	False	None	5.529061e-143	9.0	2.913269
GZMB	2.077770	0.099078	7.986641	1.206773e-97	False	None	1.128548e-95	9.0	2.997589
NKG7	2.233148	0.082191	9.329190	1.460050e-162	False	None	3.058512e-160	9.0	3.221752
CCL5	2.563929	0.077387	12.986745	1.047344e-240	False	None	3.917816e-238	9.0	3.698968
GNLY	3.470331	0.078625	32.147383	0.000000e+00	False	None	0.000000e+00	9.0	5.006629

5237 rows × 9 columns

fig, ax = st.pl.paired_binomial_glm_volcano(results_df1)
plt.show()

../_images/246651fe55ab7a9a44ffcc0d9ad93d8e1dc8a8f77d34889b5eb6620877f80dcc.png

05b - using pyDESeq2#

counts = st.pp.pseudobulk(adata, "sample_celltype") 
counts

	s1_T_cells	s1_Myeloid_cells	s1_NK_cells	s1_B_cells	s2_T_cells	s2_Myeloid_cells	s2_NK_cells	s2_B_cells	s3_T_cells	s3_B_cells	s3_Myeloid_cells	s3_NK_cells
A1BG	917	198	88	56	550	355	259	61	800	81	215	94
A2M	135	25	14	5	59	29	35	8	73	10	14	10
AAAS	272	42	31	18	158	70	57	25	203	19	34	23
AAK1	436	72	45	7	227	80	98	15	326	19	52	57
AAMP	256	69	28	21	143	72	47	25	175	18	44	24
...	...	...	...	...	...	...	...	...	...	...	...	...
ZSWIM6	153	43	13	11	67	69	36	6	88	10	38	13
ZWINT	188	27	19	7	79	28	46	10	147	12	30	12
ZYG11B	353	57	33	10	181	97	85	25	227	21	56	35
ZYX	371	147	40	24	201	208	101	17	374	28	185	70
ZZZ3	292	58	27	13	171	67	76	22	234	25	46	36

5237 rows × 12 columns

%%time
results_df2 = st.tl.pydeseq2(
    counts, 
    adata, 
    column="sample_celltype", 
    test_condition="NK_cells", 
    reference_condition="Myeloid_cells", 
    condition_column="celltype_lvl1",
)
results_df2

Using None as control genes, passed at DeseqDataSet initialization

Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.51 seconds.

Fitting dispersion trend curve...
/home/siebrenf/miniconda3/envs/stampede/lib/python3.14/site-packages/pydeseq2/dds.py:822: UserWarning: The dispersion trend curve fitting did not converge. Switching to a mean-based dispersion trend.
  self._fit_parametric_dispersion_trend(vst)
... done in 0.11 seconds.

Fitting MAP dispersions...
... done in 0.51 seconds.

Fitting LFCs...
... done in 0.51 seconds.

Calculating cook's distance...
... done in 0.01 seconds.

Replacing 0 outlier genes.

Running Wald tests...

Log2 fold change & Wald test p-value: celltype_lvl1 NK_cells vs Myeloid_cells
          baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
A1BG    200.158056       -0.439831  0.150883 -2.915053  0.003556  0.021761
A2M      22.222461        0.166992  0.297768  0.560814  0.574924  0.787363
AAAS     51.730993        0.011885  0.219241  0.054211  0.956767  0.982888
AAK1     68.129857        0.377736  0.189506  1.993267  0.046232  0.170509
AAMP     53.521286       -0.496203  0.227030 -2.185632  0.028843  0.120262
...            ...             ...       ...       ...       ...       ...
ZSWIM6   30.811360       -0.894070  0.255361 -3.501200  0.000463  0.003743
ZWINT    30.131072        0.201892  0.270985  0.745032  0.456252  0.698725
ZYG11B   61.445199       -0.070701  0.197421 -0.358122  0.720252  0.875431
ZYX     103.154574       -0.952321  0.226699 -4.200818  0.000027  0.000275
ZZZ3     57.277142        0.076049  0.202864  0.374878  0.707751  0.869661

[5237 rows x 6 columns]
CPU times: user 3.09 s, sys: 120 ms, total: 3.21 s
Wall time: 4.09 s

... done in 0.46 seconds.

	baseMean	log2FoldChange	lfcSE	stat	pvalue	padj
A1BG	200.158056	-0.439831	0.150883	-2.915053	0.003556	0.021761
A2M	22.222461	0.166992	0.297768	0.560814	0.574924	0.787363
AAAS	51.730993	0.011885	0.219241	0.054211	0.956767	0.982888
AAK1	68.129857	0.377736	0.189506	1.993267	0.046232	0.170509
AAMP	53.521286	-0.496203	0.227030	-2.185632	0.028843	0.120262
...	...	...	...	...	...	...
ZSWIM6	30.811360	-0.894070	0.255361	-3.501200	0.000463	0.003743
ZWINT	30.131072	0.201892	0.270985	0.745032	0.456252	0.698725
ZYG11B	61.445199	-0.070701	0.197421	-0.358122	0.720252	0.875431
ZYX	103.154574	-0.952321	0.226699	-4.200818	0.000027	0.000275
ZZZ3	57.277142	0.076049	0.202864	0.374878	0.707751	0.869661

5237 rows × 6 columns

fig, axs = st.pl.pydeseq2_volcano(results_df2)
plt.show()

../_images/4675c4b9892017ffec075ac86ba44579653d2430c66f01435d61d2a40c3f536d.png