00 - Setup#

import os
import stampede as st
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt

from pydeseq2.ds import DeseqStats
from pydeseq2.dds import DeseqDataSet
# dictionary with input files per slide
slides = {
    1: {
        "exprmat": "exprMat_file.csv.gz",
        "metadata": "metadata_file.csv.gz",
        "fov_positions": "fov_positions_file.csv.gz",
    }
}
# optional filepath prefix
data_dir = "data"
os.makedirs(st.config["adata_dir"], exist_ok=True)

01 - Read#

# table mapping the FOVs per slide to each sample
# column "fovs" accepts any combination of comma separated values (e.g. 1,2,3) and ranges (e.g. 4-6)
samples_df = pd.read_table("data/sample2fov.csv", sep=",")
st.validate_input(slides, samples_df, data_dir)
samples_df
sample fovs slide
0 s1 10-18 1
1 s2 19-27 1
2 s3 28-36 1
# combine the input files into a single cell object and write it to file
adata_file = os.path.join(st.config["adata_dir"], "raw_data.h5ad")
st.read_cosmx(
    slides,
    samples_df,
    adata_file,
    data_dir=data_dir,
    overwrite=False,
    verbose=True,
)
adata_file already exists and overwrite=False
'adatas/raw_data.h5ad'
adata = sc.read_h5ad(adata_file)
adata.obs
fov cell_ID slide slide-fov Area AspectRatio CenterX_local_px CenterY_local_px Width Height ... nCount nCountPerCell nFeaturePerCell propNegativeCellAvg complexityCellAvg errorCtPerCellEstimate percOfDataFromErrorPerCell qcFlagsFOV sample fovs
slide-fov-cell_ID
1-10-1 10 1 1 1-10 4260 0.97 2902 613 77 75 ... 140803 863.822086 670.730061 0.000315 1.245668 87.131902 0.100868 Pass s1 10-18
1-10-2 10 2 1 1-10 5228 0.95 3007 623 83 87 ... 140803 863.822086 670.730061 0.000315 1.245668 87.131902 0.100868 Pass s1 10-18
1-10-3 10 3 1 1-10 3204 0.89 2947 649 71 63 ... 140803 863.822086 670.730061 0.000315 1.245668 87.131902 0.100868 Pass s1 10-18
1-10-4 10 4 1 1-10 22940 0.81 2833 746 191 237 ... 140803 863.822086 670.730061 0.000315 1.245668 87.131902 0.100868 Pass s1 10-18
1-10-5 10 5 1 1-10 4404 0.76 2990 691 93 71 ... 140803 863.822086 670.730061 0.000315 1.245668 87.131902 0.100868 Pass s1 10-18
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1-36-271 36 271 1 1-36 4264 0.88 1812 2281 81 71 ... 110510 401.854545 338.160000 0.000438 1.168196 46.031818 0.114548 Pass s3 28-36
1-36-272 36 272 1 1-36 5024 0.79 397 3328 87 69 ... 110510 401.854545 338.160000 0.000438 1.168196 46.031818 0.114548 Pass s3 28-36
1-36-273 36 273 1 1-36 5460 0.98 323 3370 83 81 ... 110510 401.854545 338.160000 0.000438 1.168196 46.031818 0.114548 Pass s3 28-36
1-36-274 36 274 1 1-36 1888 0.96 455 3372 47 49 ... 110510 401.854545 338.160000 0.000438 1.168196 46.031818 0.114548 Pass s3 28-36
1-36-275 36 275 1 1-36 3880 0.97 398 3397 69 67 ... 110510 401.854545 338.160000 0.000438 1.168196 46.031818 0.114548 Pass s3 28-36

27625 rows × 79 columns

02 - QC#

02a - Slide QC#

fig, ax = st.pl.ncell_per_condition(adata, "sample")
plt.show()
../_images/a26202d31d9503c67505b6848bbd5eae843426d019634290cae69045af847aac.png
st.pp.slide_qc(adata, slides, ["sample"], data_dir=data_dir)
adata.uns["fov_metadata"].head()
slide fov x y nCounts nCell mean_CountsPerCell nCount_negprobes mean_NegProbe-CountsPerCell nCount_falsecode mean_FalseCode-CountsPerCell mean_CellSize sample
slide-fov
1-10 1 10 19773 149405 140803 163 863.822086 46 0.282209 480 2.944785 63.957736 s1
1-11 1 11 24029 149405 752088 848 886.896226 274 0.323113 2569 3.029481 54.344592 s1
1-12 1 12 28285 149405 50213 47 1068.361702 21 0.446809 151 3.212766 74.681880 s1
1-13 1 13 19773 145149 1241593 1866 665.376742 521 0.279207 5842 3.130761 52.223075 s1
1-14 1 14 24029 145149 2638265 3391 778.019758 916 0.270127 10325 3.044825 52.564838 s1
# this dataset was subset from a much larger slide
# the plot limits in this tutorial are extended to provide a sense of scale
fig, axs = st.pl.slide_qc(
    adata, columns=["sample", "nCell", "mean_CountsPerCell"], figsize=(2.5, 0.5), legend_kwargs={"ncols": 3}
)
ylim = axs[1, 0].get_ylim()
axs[1, 0].set_ylim(ylim[0]-10_000, ylim[1]+20_000)
plt.show()

fig, axs = st.pl.slide_qc(
    adata, columns=["mean_NegProbe-CountsPerCell", "mean_FalseCode-CountsPerCell", "mean_CellSize"], figsize=(2.5, 0.5)
)
ylim = axs[1, 0].get_ylim()
axs[1, 0].set_ylim(ylim[0]-10_000, ylim[1]+20_000)
plt.show()
../_images/21032582737cc9add762a797300b0ae7ef9aa2ad2ffebf07dba1888975029d2a.png ../_images/c4a8963596ea02016b77f0adacc7ba4872b962b3b221f34a95d5f23d77f09e37.png

Visualizing Field of View edge-effects#

Cells on the edges of an Field of View (FOV) may have been truncated. Furthermore, we tend to detect less RNA in them. There may also be a bias to the corners and/or one or two edges in the dataset.

# plot the average nCount_RNA per pixel over all FOVs
fig, axs = st.pl.avg_per_pixel(
    adata, column='nCount_RNA', normalize_cell_area=True
)
plt.show()

# plot the cell area for all FOVs. 
# (do not normalize for cell area when plotting the cell area)
fig, axs = st.pl.avg_per_pixel(
    adata, column='Area.um2', normalize_cell_area=False
)
plt.show()
../_images/a7adcfeaf0cb005076b60c558b96ecd6e85752128c2d8c683b3dd562898611fe.png ../_images/6e13160054ef02692720bfc64cdda20dcc5919054b603511c8375f9036d2df31.png
# visualize the effect of removing cells based on distance to FOV edge
fig, axs = st.pl.correlations(adata, xcolumn="dist2edge_px", ycolumn="nCount_RNA", max_quantile=0.7)

dist2edge_threshold = 50
print(f"{dist2edge_threshold=}")
axs[1, 1].axvline(dist2edge_threshold, color="red")
plt.show()
dist2edge_threshold=50
../_images/95bc5033debe908348ca28d0f6ea6c657dafdfd5a8869c61f7b90cbf1ac62c36.png
adata = st.pp.filter_edges(adata, dist2edge_threshold)
1_729 cells filtered out, 25_896 cell remaining.

02b - gene QC#

# remove non-PBMC genes
fname = os.path.join(data_dir, "blacklist_pbmc_6k.txt")
blacklist = pd.read_table(fname, header=None)[0].to_list()

before = len(adata.var)
adata = adata[:, ~adata.var.index.isin(blacklist)].copy()
after = len(adata.var)
print(before - after, "genes filtered out")
904 genes filtered out
# mark core immune genes
fname = os.path.join(data_dir, "core_immune_genes_6k.txt")
whitelist = pd.read_table(fname, header=None)[0].to_list()
adata.var["is_core_immune_gene"] = adata.var.index.isin(whitelist)
# determine a noise threshold for gene expression using the mean expression 
# and standard deviation of the negative control probes
st.pp.gene_qc(adata)
adata.var
is_core_immune_gene is_negctrl is_sysctrl nCell pctCell nTranscript mean_Transcript above_noise
A1BG False False False 5625 21.721501 6883 0.265794 True
A2M True False False 635 2.452116 658 0.025409 True
AAAS False False False 1416 5.468026 1496 0.057770 True
AAK1 True False False 2175 8.398981 2363 0.091250 True
AAMP False False False 1483 5.726753 1591 0.061438 True
... ... ... ... ... ... ... ... ...
ZSWIM6 True False False 854 3.297807 888 0.034291 True
ZWINT True False False 969 3.741891 1019 0.039350 True
ZYG11B False False False 1855 7.163268 2002 0.077309 True
ZYX True False False 2650 10.233241 3012 0.116311 True
ZZZ3 True False False 1606 6.201730 1709 0.065995 True

5614 rows × 8 columns

fig, ax = st.pl.noise_threshold(adata, max_quantile=0.95)
plt.show()
../_images/d023f4b9e4f9bf179be7683ecd8d2f52148d300c2b6a462174d8f728b896aa20.png
adata = st.pp.filter_genes(adata)
377 genes filtered out, 5_237 genes remaining.

02c - cell QC#

fig, axs = st.pl.violin(
    adata, 
    ["nCount_RNA", "nFeature_RNA", "Area.um2", "nCount_negprobes", "nCount_falsecode"],
    log_scale=(True, True, True, False, False)
)
plt.show()
../_images/7d9054dd1bb8f7f41786d40bf1ac373d972b5c98c249f53eefa41823da777d98.png
adata = st.pp.filter_cells(
    adata, 
    falsecode_max = 5,
    negprobe_max = 2,
    ntranscript_min = 250,
    ntranscript_max = 1500,
    area_min = 25,
    area_max = 110,
)
9_123 cells filtered out, 16_773 cells remaining.
st.pp.gene_qc_postfilter(adata)
adata.var[["nCell", "nCell_postfilter", "pctCell", "pctCell_postfilter"]]
nCell nCell_postfilter pctCell pctCell_postfilter
A1BG 5625 3674 21.721501 21.90
A2M 635 417 2.452116 2.49
AAAS 1416 952 5.468026 5.68
AAK1 2175 1434 8.398981 8.55
AAMP 1483 922 5.726753 5.50
... ... ... ... ...
ZSWIM6 854 547 3.297807 3.26
ZWINT 969 605 3.741891 3.61
ZYG11B 1855 1180 7.163268 7.04
ZYX 2650 1766 10.233241 10.53
ZZZ3 1606 1067 6.201730 6.36

5237 rows × 4 columns

st.pp.cell_qc_postfilter(adata)
adata.obs[["nFeature_RNA", "nFeature_RNA_postfilter", "nCount_RNA", "nCount_RNA_postfilter"]]
nFeature_RNA nFeature_RNA_postfilter nCount_RNA nCount_RNA_postfilter
slide-fov-cell_ID
1-10-1 627 560 821 748
1-10-2 873 771 1185 1067
1-10-6 648 573 838 748
1-10-8 831 716 1015 888
1-10-9 666 583 823 730
... ... ... ... ...
1-36-269 750 643 900 786
1-36-270 725 629 863 754
1-36-272 847 718 1033 894
1-36-273 814 711 989 871
1-36-275 660 562 782 676

16773 rows × 4 columns

fig, ax = st.pl.ncell_per_condition(adata, columns = ["sample"])
plt.show()
../_images/1f03e1b495ab2e3a0edbb8c74429140f97305e64b070b45f32696dfb37abd4d3.png

03 - Dimensionality reduction#

st.pp.binarize(adata)
binary layer set as adata.X
st.pp.dim_red(adata, use_genes="is_core_immune_gene")
fig, ax = st.pl.scree(adata)
plt.show()
../_images/a5ad4eaede38027cbf538774d4a2370d0127c1cfad9404d2d4c964c5bfd5fec9.png
fig_ax_list = st.pl.dim_red(
    adata, 
    columns=["sample"],
    subset_size=1000,
)
plt.show()
../_images/96addafce410beeb94861757737551479edb82edabddf9879c41adb6489cf3c6.png
sc.pp.neighbors(
    adata,
    use_rep="X_svd",
    key_added="neighbors_svd",
)

sc.tl.umap(
    adata,
    neighbors_key="neighbors_svd",
    key_added="umap_svd",
)

sc.pl.embedding(
    adata,
    basis="umap_svd",
    color="sample",
    alpha=0.5,
)
../_images/13d6be289aa4c55a177d68fd37ae53cafa984707d5e1b953ae6fb599759372b7.png

04 - Clustering#

st.pp.knn_count_smoothing(
    adata,
    neighbors_key="neighbors_svd",
)
KNN_binary_mean layer set as adata.X
markerdict = {
    "T_cells": [
        "CD3D", "TRBC1","IL7R", "CD8B",
    ],
    "B_cells": [
        "MS4A1", "CD79A", "CD79B", "CD37"
    ],
    "NK_cells": [
        "NKG7", "GNLY", "KLRD1", "FCGR3A"
    ],
    "Monocytes_Macrophages": [
        "LYZ", "S100A8", "CTSD", "CD14",
    ],
}
for ctype, markers in markerdict.items():
    print(ctype)
    sc.pl.embedding(
        adata,
        basis="umap_svd",
        color=markers,
        layer="KNN_binary_mean",
        ncols=4,
        alpha=0.5,
    )
T_cells
../_images/de72ca37f88bc1825abb2f48fca3ad9f5cde96228a208bf54fd0c4bf4affbf7b.png
B_cells
../_images/b44db9b82f9b89e3a5f3291aae8351f169e834f289338b9457fef4b04c12d281.png
NK_cells
../_images/53e24936bba54160d3f5bdf93aefc9226d90e4345a3135e9746ca04b5cc7ac08.png
Monocytes_Macrophages
../_images/73a0111bea716aa5695c9bc22cce7a064b9002ff68d3f31588ace2a3bf73fc64.png
res = 0.5
sc.tl.leiden(
    adata,
    resolution=res,
    neighbors_key="neighbors_svd",
    key_added=f"leiden_res_{res:.2f}",
    flavor="igraph",
    n_iterations=2,
    directed=False,
    random_state=42,
)
sc.tl.rank_genes_groups(adata, f"leiden_res_{res:.2f}", n_genes=6, layer="binary")
sc.tl.dendrogram(adata, f"leiden_res_{res:.2f}", use_rep="X_svd")
sc.pl.rank_genes_groups_matrixplot(
    adata,
    n_genes=6,
)
WARNING: It seems you use rank_genes_groups on the raw count data. Please logarithmize your data before calling rank_genes_groups.
../_images/974ea3d8f5898dbf166aab669e27540b07828abeae4c419117f5bd47c492eaa4.png
cluster2ctype = {
    "0": "T_cells",
    "1": "NK_cells",
    "2": "B_cells",
    "3": "Myeloid_cells",
}
adata.obs["celltype_lvl1"] = adata.obs[f"leiden_res_{res:.2f}"].astype(str).replace(cluster2ctype).astype("category")
sc.pl.embedding(
    adata,
    basis="umap_svd",
    color="celltype_lvl1",
    alpha=0.5,
)
../_images/47341571b8a7390a033b426c1e38f10fe3f733d22606697f956f5d99be5a4607.png

05 - Differential expression analysis#

st.pp.combine_obs_columns(adata, ["sample", "celltype_lvl1"], "sample_celltype")
adata.obs["sample_celltype"].value_counts().head()
sample_celltype
s1_T_cells          4003
s3_T_cells          3912
s2_T_cells          2483
s2_NK_cells         1359
s2_Myeloid_cells    1351
Name: count, dtype: int64

05a - using detection rates & pbGLM#

det_rate_df = st.pp.detection_rates(adata, "sample_celltype")
det_rate_df
s1_T_cells s3_T_cells s2_T_cells s2_NK_cells s2_Myeloid_cells s3_Myeloid_cells s1_Myeloid_cells s3_NK_cells s1_NK_cells s3_B_cells s2_B_cells s1_B_cells
A1BG 0.166938 0.186624 0.192226 0.180092 0.241642 0.234460 0.198680 0.152472 0.154464 0.222320 0.177528 0.168530
A2M 0.022996 0.016689 0.019951 0.024067 0.019234 0.015267 0.023463 0.016206 0.023768 0.027226 0.022711 0.013955
AAAS 0.046860 0.046575 0.053780 0.039238 0.046576 0.037077 0.039669 0.037278 0.053029 0.051782 0.071536 0.051186
AAK1 0.076153 0.075050 0.077622 0.067599 0.053272 0.056707 0.068781 0.092419 0.077461 0.051782 0.042722 0.019593
AAMP 0.044045 0.040120 0.048625 0.032338 0.047914 0.047983 0.065840 0.038899 0.047833 0.049051 0.071536 0.059978
... ... ... ... ... ... ... ... ... ... ... ... ...
ZSWIM6 0.026101 0.020126 0.022668 0.024756 0.045907 0.041439 0.040629 0.021068 0.022061 0.027226 0.017018 0.030966
ZWINT 0.032164 0.033675 0.026749 0.031648 0.018569 0.032715 0.025359 0.019447 0.032328 0.032678 0.028416 0.019593
ZYG11B 0.061227 0.052116 0.061703 0.058594 0.064679 0.061069 0.054143 0.056735 0.056500 0.057245 0.071536 0.028111
ZYX 0.064447 0.086215 0.068612 0.069678 0.139921 0.201745 0.144557 0.113513 0.068700 0.076388 0.048464 0.068847
ZZZ3 0.050390 0.053733 0.058255 0.052366 0.044569 0.050164 0.055113 0.058357 0.046104 0.068180 0.062864 0.036702

5237 rows × 12 columns

%%time
results_df1 = st.tl.paired_binomial_glm(
    det_rate_df, 
    adata, 
    column="sample_celltype", 
    test_condition="NK_cells", 
    reference_condition="Myeloid_cells", 
    condition_column="celltype_lvl1",
)
results_df1
CPU times: user 22.3 s, sys: 15.9 ms, total: 22.3 s
Wall time: 22.3 s
beta se odds_ratio pval perfect_separation error padj -log10(padj) log2(odds_ratio)
gene
IFI30 -4.339651 0.090796 0.013041 0.000000e+00 False None 0.000000e+00 9.0 -6.260793
TIMP1 -4.036057 0.084280 0.017667 0.000000e+00 False None 0.000000e+00 9.0 -5.822799
LYZ -3.551084 0.079467 0.028694 0.000000e+00 False None 0.000000e+00 9.0 -5.123131
PSAP -3.309378 0.073933 0.036539 0.000000e+00 False None 0.000000e+00 9.0 -4.774423
THBS1 -3.108490 0.072474 0.044668 0.000000e+00 False None 0.000000e+00 9.0 -4.484603
... ... ... ... ... ... ... ... ... ...
CD247 2.019324 0.078699 7.533233 3.378460e-145 False None 5.529061e-143 9.0 2.913269
GZMB 2.077770 0.099078 7.986641 1.206773e-97 False None 1.128548e-95 9.0 2.997589
NKG7 2.233148 0.082191 9.329190 1.460050e-162 False None 3.058512e-160 9.0 3.221752
CCL5 2.563929 0.077387 12.986745 1.047344e-240 False None 3.917816e-238 9.0 3.698968
GNLY 3.470331 0.078625 32.147383 0.000000e+00 False None 0.000000e+00 9.0 5.006629

5237 rows × 9 columns

fig, ax = st.pl.paired_binomial_glm_volcano(results_df1)
plt.show()
../_images/246651fe55ab7a9a44ffcc0d9ad93d8e1dc8a8f77d34889b5eb6620877f80dcc.png

05b - using pyDESeq2#

counts = st.pp.pseudobulk(adata, "sample_celltype") 
counts
s1_T_cells s1_Myeloid_cells s1_NK_cells s1_B_cells s2_T_cells s2_Myeloid_cells s2_NK_cells s2_B_cells s3_T_cells s3_B_cells s3_Myeloid_cells s3_NK_cells
A1BG 917 198 88 56 550 355 259 61 800 81 215 94
A2M 135 25 14 5 59 29 35 8 73 10 14 10
AAAS 272 42 31 18 158 70 57 25 203 19 34 23
AAK1 436 72 45 7 227 80 98 15 326 19 52 57
AAMP 256 69 28 21 143 72 47 25 175 18 44 24
... ... ... ... ... ... ... ... ... ... ... ... ...
ZSWIM6 153 43 13 11 67 69 36 6 88 10 38 13
ZWINT 188 27 19 7 79 28 46 10 147 12 30 12
ZYG11B 353 57 33 10 181 97 85 25 227 21 56 35
ZYX 371 147 40 24 201 208 101 17 374 28 185 70
ZZZ3 292 58 27 13 171 67 76 22 234 25 46 36

5237 rows × 12 columns

%%time
results_df2 = st.tl.pydeseq2(
    counts, 
    adata, 
    column="sample_celltype", 
    test_condition="NK_cells", 
    reference_condition="Myeloid_cells", 
    condition_column="celltype_lvl1",
)
results_df2
Using None as control genes, passed at DeseqDataSet initialization
Fitting size factors...
... done in 0.00 seconds.

Fitting dispersions...
... done in 0.51 seconds.

Fitting dispersion trend curve...
/home/siebrenf/miniconda3/envs/stampede/lib/python3.14/site-packages/pydeseq2/dds.py:822: UserWarning: The dispersion trend curve fitting did not converge. Switching to a mean-based dispersion trend.
  self._fit_parametric_dispersion_trend(vst)
... done in 0.11 seconds.

Fitting MAP dispersions...
... done in 0.51 seconds.

Fitting LFCs...
... done in 0.51 seconds.

Calculating cook's distance...
... done in 0.01 seconds.

Replacing 0 outlier genes.

Running Wald tests...
Log2 fold change & Wald test p-value: celltype_lvl1 NK_cells vs Myeloid_cells
          baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
A1BG    200.158056       -0.439831  0.150883 -2.915053  0.003556  0.021761
A2M      22.222461        0.166992  0.297768  0.560814  0.574924  0.787363
AAAS     51.730993        0.011885  0.219241  0.054211  0.956767  0.982888
AAK1     68.129857        0.377736  0.189506  1.993267  0.046232  0.170509
AAMP     53.521286       -0.496203  0.227030 -2.185632  0.028843  0.120262
...            ...             ...       ...       ...       ...       ...
ZSWIM6   30.811360       -0.894070  0.255361 -3.501200  0.000463  0.003743
ZWINT    30.131072        0.201892  0.270985  0.745032  0.456252  0.698725
ZYG11B   61.445199       -0.070701  0.197421 -0.358122  0.720252  0.875431
ZYX     103.154574       -0.952321  0.226699 -4.200818  0.000027  0.000275
ZZZ3     57.277142        0.076049  0.202864  0.374878  0.707751  0.869661

[5237 rows x 6 columns]
CPU times: user 3.09 s, sys: 120 ms, total: 3.21 s
Wall time: 4.09 s
... done in 0.46 seconds.
baseMean log2FoldChange lfcSE stat pvalue padj
A1BG 200.158056 -0.439831 0.150883 -2.915053 0.003556 0.021761
A2M 22.222461 0.166992 0.297768 0.560814 0.574924 0.787363
AAAS 51.730993 0.011885 0.219241 0.054211 0.956767 0.982888
AAK1 68.129857 0.377736 0.189506 1.993267 0.046232 0.170509
AAMP 53.521286 -0.496203 0.227030 -2.185632 0.028843 0.120262
... ... ... ... ... ... ...
ZSWIM6 30.811360 -0.894070 0.255361 -3.501200 0.000463 0.003743
ZWINT 30.131072 0.201892 0.270985 0.745032 0.456252 0.698725
ZYG11B 61.445199 -0.070701 0.197421 -0.358122 0.720252 0.875431
ZYX 103.154574 -0.952321 0.226699 -4.200818 0.000027 0.000275
ZZZ3 57.277142 0.076049 0.202864 0.374878 0.707751 0.869661

5237 rows × 6 columns

fig, axs = st.pl.pydeseq2_volcano(results_df2)
plt.show()
../_images/4675c4b9892017ffec075ac86ba44579653d2430c66f01435d61d2a40c3f536d.png