Intro to Spatial Data

Introduction

In this notebook we will introduce loading, pre- and post-processing, clustering and cell typing for a VisiumHD dataset.

Loading data

Here we load an already processed object downloaded from 10X genomics:

Warning

The full dataset is quite large, so the next few code chunks are for demonstration purposes only. We subset the full object to a smaller region of tissue, and that subset is what you will load later. Skip to the hands-on section to load the zoomed-in object and proceed.

Expand the code below to see how you would load and pre-process output from VisiumHD before saving a Seurat object.

Code
# Set the directory where the data are
dirs <- c("raw_data/Visium_HD_Breast_dataset_CRI/Output_Files/")

# Path to save individual processed Seurat objects
outs_path <- "objects/"

# Load individual sample
object <- Load10X_Spatial(data.dir = dirs, bin.size = c(8))

# Assign metadata
object$orig.ident <- "Visium_10X_breast_cancer"
object$sample_name <- "Visium_DCIS"
Idents(object) <- "sample_name"

# Compute mitochondrial percentage
object[["MT.percent"]] <- PercentageFeatureSet(object, pattern = "^MT-")

# Perform initial data normalization
DefaultAssay(object) <- "Spatial.008um"
object <- NormalizeData(object)

# Save the full object
saveRDS(object, file.path(outs_path, "Visium_DCIS.rds"))

For the purposes of the tutorial we subset the dataset to a smaller section of tissue to reduce its size and the computational time. This code is not needed for the course, but is available below if you are interested.

Code
# Coordinates to zoom in to
coordinates <- GetTissueCoordinates(object)

min_x <- 14000
max_x <- 17500
min_y <- 15000
max_y <- 18500

subset_condition <- (coordinates$x > min_x) & (coordinates$y > min_y) & (coordinates$y < max_y) & (coordinates$x < max_x)

DCIS_zoom <- subset(object, cells = which(subset_condition))

# Remove completely empty bins
DCIS_zoom <- DCIS_zoom[, DCIS_zoom$nCount_Spatial.008um > 0]

# Update the seurat object to ensure object validity
DCIS_zoom <- UpdateSeuratObject(DCIS_zoom)

saveRDS(DCIS_zoom, file = "objects/Visium_DCIS_zoomed.rds")

Hands On Section

We will start the hands-on section by loading the zoomed-in VisiumHD tissue section before further downstream processing and plotting

object <- readRDS("objects/Visium_DCIS_zoomed.rds")

Let’s take a look at this object

object
An object of class Seurat 
18085 features across 19823 samples within 1 assay 
Active assay: Spatial.008um (18085 features, 0 variable features)
 2 layers present: counts, data
 1 spatial field of view present: slice1.008um

Let’s also set up some defaults that can be used throughout the downstream processing

# This sets the size of points in the spatial plots
pt_size_factor <- 13

# Color defaults for clusters
distinct_palette <- c(
  "0" = "#1f77b4",
  "1" = "#ff7f0e",
  "2" = "#2ca02c",
  "3" = "#d62728",
  "4" = "#9467bd",
  "5" = "#8c564b",
  "6" = "#ff99cc",
  "7" = "#7f7f7f",
  "8" = "#bcbd22",
  "9" = "#17becf",
  "10" = "#393b79",
  "11" = "#637939",
  "12" = "#8c6d31"
)

# Cell type colors
celltype_cols <- c(
  "CD8.T.cells" = "green4",
  "CD4.T.cells" = "blue",
  "B.cell" = "#F781BF",
  "Plasma" = "#FF7F00",
  "Monocytes" = "purple",
  "Macrophages" = "#b15928",
  "DCs" = "#17becf",
  "Endothelial.cells" = "#E41A1C",
  "CAF" = "#984ea3",
  "Basal_Myoepithelial" = "mediumseagreen",
  "Epithelial.cells" = "#fdbf6f",
  "Fibroblasts" = "darkgrey",
  "Luminal.progenitor" = "navy",
  "Luminal.mature" = "#bcbd22",
  "Tumor" = "magenta",
  "NK.cells" = "lavender",
  "Mast.cells" = "green2",
  "Adipocytes" = "lightgrey",
  "Other" = "lightgrey"
)

epi_type_cols <- c(
  "Basal_Myoepithelial" = "mediumseagreen",
  "Luminal.progenitor" = "navy",
  "Luminal.mature" = "#bcbd22",
  "Tumor" = "magenta"
)

We can plot some quality metrics like total counts, genes detected, and mitochondrial read percentage per bin:

counts_violin <- VlnPlot(
  object,
  features = "nCount_Spatial.008um",
  pt.size = 0,
  raster = FALSE
) +
  NoLegend()

counts_spatial <- SpatialFeaturePlot(
  object,
  features = "nCount_Spatial.008um",
  pt.size.factor = pt_size_factor,
  image.alpha = 0.8,
  alpha = c(1, 1)
) +
  theme(legend.position = "right")

genes_violin <- VlnPlot(
  object,
  features = "nFeature_Spatial.008um",
  pt.size = 0,
  raster = FALSE
) +
  NoLegend()

genes_spatial <- SpatialFeaturePlot(
  object,
  features = "nFeature_Spatial.008um",
  pt.size.factor = pt_size_factor,
  image.alpha = 0.8,
  alpha = c(1, 1)
) +
  theme(legend.position = "right")

mt_violin <- VlnPlot(
  object,
  features = "MT.percent",
  pt.size = 0,
  raster = FALSE
) +
  NoLegend()

mt_spatial <- SpatialFeaturePlot(
  object,
  features = "MT.percent",
  pt.size.factor = pt_size_factor,
  max.cutoff = "q99",
  image.alpha = 0.8,
  alpha = c(1, 1)
) +
  theme(legend.position = "right")

# Combine the plots using 'patchwork'
p0 <- (counts_violin | counts_spatial) /
  (genes_violin | genes_spatial) /
  (mt_violin | mt_spatial)
p0
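
As an optional numeric companion to these plots, you could summarize the same three metrics directly from the metadata. A minimal sketch, not part of the original workflow:

# Summary statistics for counts, genes detected, and mitochondrial percentage per bin
summary(object@meta.data[, c(
  "nCount_Spatial.008um",
  "nFeature_Spatial.008um",
  "MT.percent"
)])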

Quality filtering

Here we filter to keep only bins with less than 25% mitochondrial counts, more than 10 total counts, and more than 10 genes detected per 8um bin.

# UMI and Gene Threshold
object$QCFilter <- ifelse(
  object$MT.percent < 25 &
    object$nCount_Spatial.008um > 10 &
    object$nFeature_Spatial.008um > 10,
  yes = "Keep", no = "Remove"
)

table(object$QCFilter)

  Keep Remove 
 19102    721 

Take a look at the low quality bins using SpatialDimPlot()

p1 <- SpatialDimPlot(object,
  group.by = "QCFilter",
  pt.size.factor = pt_size_factor,
  image.alpha = 0.2,
  alpha = c(1, 0.2)
) +
  guides(fill = guide_legend(override.aes = list(size = 4)))
counts_spatial | p1

Filter our object to keep only the high-quality bins

keep_cells <- colnames(object)[object$QCFilter == "Keep"]
object <- subset(object, cells = keep_cells)

# We find variable features and scale after quality filtering
object <- FindVariableFeatures(object)
object <- ScaleData(object)

Now take a look at our object

object
An object of class Seurat 
18085 features across 19102 samples within 1 assay 
Active assay: Spatial.008um (18085 features, 2000 variable features)
 3 layers present: counts, data, scale.data
 1 spatial field of view present: slice1.008um

Clustering

Banksy

Above, we normalized the counts, and found the variable features as you would in a normal scRNA analysis workflow using Seurat. However, in spatial data, we use clustering methods like Banksy which make use of the spatial information when defining clusters. Check the documentation for the RunBanksy() command for more information.

# Run Banksy
object <- RunBanksy(object,
  lambda = 0.8,
  assay = "Spatial.008um",
  slot = "data",
  features = "variable",
  k_geom = 24,
  verbose = TRUE
)

If we look at the object we now see that the active assay is set to BANKSY

object
An object of class Seurat 
22085 features across 19102 samples within 2 assays 
Active assay: BANKSY (4000 features, 0 variable features)
 2 layers present: data, scale.data
 1 other assay present: Spatial.008um
 1 spatial field of view present: slice1.008um

Now we RunPCA() on the BANKSY assay.

object <- RunPCA(object,
  assay = "BANKSY",
  reduction.name = "pca.banksy",
  features = rownames(object),
  npcs = 20
)

Look at the elbow plot to help determine the number of principal components to use for the downstream steps

ElbowPlot(object, reduction = "pca.banksy")

Note

Normally we use the above plot to determine the number of dimensions to use for downstream kNN and clustering. Here, because we've subset the data, it looks like we could use fewer than the first 15 PCs, but we'll keep 15 since it won't make a meaningful difference in the downstream analysis.
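
If you prefer a quantitative check over eyeballing the elbow plot, a minimal sketch like the one below (using the pca.banksy reduction computed above) reports how the variance captured by the 20 computed PCs is distributed. Note this is only the variance within the computed PCs, not the whole dataset.

# Standard deviations of the computed BANKSY PCs
pc_sd <- Stdev(object, reduction = "pca.banksy")

# Percentage of the captured variance explained by each PC
pct_var <- pc_sd^2 / sum(pc_sd^2) * 100

# Smallest number of PCs covering ~90% of that variance
which(cumsum(pct_var) >= 90)[1]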

Similar to conventional scRNA analysis, we now run FindNeighbors(), FindClusters(), and then RunUMAP()

object <- FindNeighbors(object,
  reduction = "pca.banksy",
  dims = 1:15
)

object <- FindClusters(object,
  cluster.name = "banksy_cluster",
  resolution = 0.3
)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 19102
Number of edges: 496475

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9067
Number of communities: 8
Elapsed time: 1 seconds
object <- RunUMAP(object,
  reduction = "pca.banksy",
  reduction.name = "umap.banksy",
  return.model = T,
  dims = 1:15
)
Idents(object) <- "banksy_cluster"

p2 <- SpatialDimPlot(object,
  group.by = "banksy_cluster",
  pt.size.factor = pt_size_factor,
  label = F,
  label.size = 5,
  image.alpha = 0.2,
  alpha = c(1, 1)
) +
  scale_fill_manual(values = distinct_palette) +
  ggtitle("Banksy Neighborhood Clustering") +
  theme(
    legend.position = "right",
    legend.key.size = unit(0.5, "cm")
  ) +
  guides(fill = guide_legend(override.aes = list(size = 5)))

DefaultAssay(object) <- "BANKSY"
Idents(object) <- "banksy_cluster"

p3 <- DimPlot(object,
  reduction = "umap.banksy",
  label = TRUE,
  label.size = 2,
  raster = FALSE
) +
  scale_color_manual(values = distinct_palette) +
  ggtitle("Banksy Neighborhood Clustering") +
  theme(
    legend.position = "right",
    aspect.ratio = 1,
    legend.key.size = unit(0.5, "cm")
  ) +
  guides(fill = guide_legend(override.aes = list(size = 5)))

final_plot <- wrap_plots(p2, p3, ncol = 2)
final_plot

Sketching and Projection

Seurat V5 introduced the concept of data sketching. Briefly, this approach will ‘sketch’ a representative subset of your dataset (importantly preserving rare cell types), prior to performing dimensional reduction, clustering, and computing a UMAP. It then ‘projects’ the rest of your data into this reduced space so that the complete dataset can be utilized in downstream analysis.

Note

In our example dataset we already subset to ~20,000 bins, so sketching here is just for demonstration of the code. In full VisiumHD datasets, which typically contain 500k or more bins, this is almost always required as the processing will take a very long time otherwise, and the results are typically comparable.

Warning

Unlike Banksy, this type of clustering is not spatially aware! It treats the data as you would a conventional scRNA dataset, not taking into account the spatial localization of different cell types. This isn’t inherently a bad thing, but it is important to know when interpreting the results.

DefaultAssay(object) <- "Spatial.008um"

# Sketch the dataset
object <- SketchData(
  object = object,
  ncells = 10000,
  features = VariableFeatures(object),
  method = "LeverageScore",
  sketched.assay = "sketch"
)

Now if we look at the object we see that the sketch assay is now active

object
An object of class Seurat 
40170 features across 19102 samples within 3 assays 
Active assay: sketch (18085 features, 2000 variable features)
 2 layers present: counts, data
 2 other assays present: Spatial.008um, BANKSY
 2 dimensional reductions calculated: pca.banksy, umap.banksy
 1 spatial field of view present: slice1.008um

We proceed with the analysis as we typically would for an scRNA dataset

# switch analysis to sketched cells
DefaultAssay(object) <- "sketch"

# Perform the clustering workflow
object <- FindVariableFeatures(object)
object <- ScaleData(object)
object <- RunPCA(object, assay = "sketch", reduction.name = "pca.sketch")
ElbowPlot(object, reduction = "pca.sketch")

object <- FindNeighbors(object,
  assay = "sketch",
  reduction = "pca.sketch",
  dims = 1:15
)

object <- FindClusters(object,
  cluster.name = "seurat_cluster.sketched",
  resolution = 0.5
)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 10000
Number of edges: 285667

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8110
Number of communities: 8
Elapsed time: 1 seconds
object <- RunUMAP(object,
  reduction = "pca.sketch",
  reduction.name = "umap.sketch",
  return.model = T,
  dims = 1:15
)

Now that we have run the processing on the ‘sketched’ data subset, we project the rest of the data into this space

object <- ProjectData(
  object = object,
  assay = "Spatial.008um",
  full.reduction = "full.pca.sketch",
  sketched.assay = "sketch",
  sketched.reduction = "pca.sketch",
  umap.model = "umap.sketch",
  dims = 1:15,
  refdata = list(seurat_cluster.projected = "seurat_cluster.sketched")
)
object
An object of class Seurat 
40170 features across 19102 samples within 3 assays 
Active assay: sketch (18085 features, 2000 variable features)
 3 layers present: counts, data, scale.data
 2 other assays present: Spatial.008um, BANKSY
 6 dimensional reductions calculated: pca.banksy, umap.banksy, pca.sketch, umap.sketch, full.pca.sketch, full.umap.sketch
 1 spatial field of view present: slice1.008um
DefaultAssay(object) <- "sketch"
Idents(object) <- "seurat_cluster.sketched"

p4 <- DimPlot(object,
  reduction = "umap.sketch",
  label = T,
  raster = F
) +
  scale_color_manual(values = distinct_palette) +
  ggtitle("Sketched clustering (10,000 cells)") +
  theme(
    legend.key.size = unit(0.5, "cm"),
    aspect.ratio = 1
  ) +
  guides(fill = guide_legend(override.aes = list(size = 5)))

# switch to full dataset
DefaultAssay(object) <- "Spatial.008um"
Idents(object) <- "seurat_cluster.projected"

p5 <- DimPlot(object,
  reduction = "full.umap.sketch",
  label = T,
  raster = F
) +
  scale_color_manual(values = distinct_palette) +
  ggtitle("Projected clustering (full dataset)") +
  theme(
    legend.key.size = unit(0.5, "cm"),
    aspect.ratio = 1
  ) +
  guides(fill = guide_legend(override.aes = list(size = 5)))

final_plot <- wrap_plots(p4, p5, ncol = 2)
final_plot

Comparison

Now we can compare the spatially aware Banksy clustering with the non-spatially-aware ‘default’ Seurat clustering:

DefaultAssay(object) <- "Spatial.008um"
Idents(object) <- "seurat_cluster.projected"

p6 <- SpatialDimPlot(object,
  group.by = "seurat_cluster.projected",
  pt.size.factor = pt_size_factor,
  image.alpha = 0.2,
  alpha = c(1, 1)
) +
  scale_fill_manual(values = distinct_palette) +
  ggtitle("Projected clustering (full dataset)") +
  theme(legend.key.size = unit(0.5, "cm")) +
  guides(fill = guide_legend(override.aes = list(size = 5)))

final_plot <- wrap_plots(p2, p3, p6, p5, ncol = 2)
final_plot

Let’s compare the cell-to-cell overlap in Banksy versus conventional clustering assignments

# Here we create a cross-table of each cluster type
prop_table <- prop.table(
  table(
    "Banksy" = object$banksy_cluster,
    "Sketched" = object$seurat_cluster.projected
  ),
  margin = 1
)
ComplexHeatmap::Heatmap(prop_table,
  name = "Proportion",
  column_title = "Sketched",
  column_title_side = "bottom",
  row_title = "Banksy",
  row_title_side = "right",
  col = viridis::cividis(10)
)

Note

We can see that there is some overlap, but many differences as well. Why is this?

It is important to consider what we want out of spatial clustering:

  • Do we want discrete cell type clusters?
  • Do we want clusters of cell neighborhoods?

Many spatially aware clustering methods are being published as the field is relatively new, and technologies are constantly being improved. As such, there is not yet a gold standard approach to solving these problems. As you can appreciate, Banksy seems to preserve more of the tissue architecture, whereas the conventional Seurat approach does not utilize the spatial information. How exactly you process and interpret your own data will depend on the types of questions you seek to answer.
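
To put a single number on how much the two cluster assignments agree overall, you could also compute the adjusted Rand index between the two sets of labels. A minimal sketch, assuming the mclust package is installed:

library(mclust)

# Adjusted Rand index: 1 = identical partitions, ~0 = no better than chance
adjustedRandIndex(object$banksy_cluster, object$seurat_cluster.projected)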

Cell type identification

De novo marker identification

Now we’d like to identify the constituent cell types in our sample. This analysis proceeds according to the conventional workflow of selecting the clusters of interest, and running Seurat’s FindAllMarkers().

# Ensure the spatial count data is the active assay prior to proceeding
DefaultAssay(object) <- "Spatial.008um"
# Here we use the banksy clusters
Idents(object) <- "banksy_cluster"

banksy_markers <- FindAllMarkers(object,
  assay = "Spatial.008um",
  only.pos = T,
  logfc.threshold = 0.4,
  min.pct = 0.1,
  verbose = TRUE
)

Now we can plot a heatmap of the top markers per cluster (here the top 5) using Seurat’s DoHeatmap()

top_n_genes <- 5 # Select how many top genes to plot
# Select the top markers by cluster
top_genes <- banksy_markers %>%
  group_by(cluster) %>%
  dplyr::filter(p_val_adj < 0.05 & avg_log2FC > 1) %>%
  slice_head(n = top_n_genes) %>%
  ungroup()

# Scale the data for these genes
object <- ScaleData(object, assay = "Spatial.008um", features = top_genes$gene)

# Plot a heatmap using Seurat's DoHeatmap
p1 <- DoHeatmap(
  # We subsample our object when we plot to 100 cells per
  # banksy cluster, otherwise plot will not generate
  object = subset(object, downsample = 100),
  group.by = "banksy_cluster",
  group.colors = distinct_palette,
  assay = "Spatial.008um",
  features = top_genes$gene,
  raster = T,
  size = 2.5
) +
  scale_fill_viridis_c()
p1

We can also plot using Seurat’s DotPlot()

# Seurat's DotPlot function doesn't like duplicated genes across groups
top_n_genes <- 5 # Select how many top genes to plot
top_unique <- banksy_markers %>%
  group_by(cluster) %>%
  dplyr::filter(p_val_adj < 0.05 & avg_log2FC > 1) %>%
  slice_head(n = top_n_genes) %>%
  ungroup() %>%
  distinct(gene, .keep_all = T)

top_list <- split(top_unique$gene, top_unique$cluster)

p <- DotPlot(object, features = top_list, group.by = "banksy_cluster") +
  scale_color_gradient2(low = "blue", high = "red") +
  guides(x = guide_axis(angle = 90))
p

Using curated celltype markers

We can also load a manually curated marker list for cell types

marker_table <- read_excel("data/DCIS_genes.xlsx")
unique(marker_table$Cell.type)
 [1] "CD8.T.cells"         "CD4.T.cells"         "NK.cells"           
 [4] "B.cell"              "Plasma"              "Mast.cells"         
 [7] "Macrophages"         "Monocytes"           "DCs"                
[10] "Endothelial.cells"   "CAF"                 "Basal_Myoepithelial"
[13] "Epithelial.cells"    "Fibroblasts"         "Adipocytes"         
[16] "Tumor"               "Luminal.progenitor"  "Luminal.mature"     
head(marker_table)
# A tibble: 6 × 2
  Gene  Cell.type  
  <chr> <chr>      
1 CD8B  CD8.T.cells
2 CD8A  CD8.T.cells
3 CD3E  CD8.T.cells
4 CD3D  CD8.T.cells
5 CD4   CD4.T.cells
6 CD3E  CD4.T.cells

Using this marker list, we add a score for each cell type to every cell using Seurat’s AddModuleScore(), and then assign each cell the type with the maximum score.

# We format the markers into a list
marker_list <- split(marker_table$Gene, marker_table$Cell.type)
head(marker_list)
$Adipocytes
 [1] "PNPLA2"  "CAV1"    "FABP4"   "PPARG"   "CEBPA"   "LEP"     "CIDEA"  
 [8] "SHOX2"   "SLC7A10" "SLC36A2" "P2RX5"  

$B.cell
[1] "BANK1" "CD79A" "CD74"  "MS4A1" "MEF2C" "CD19"  "CD79B"

$Basal_Myoepithelial
[1] "KRT14"    "DST"      "MMP7"     "MIR205HG" "MT1X"     "OXTR"     "KRT17"   
[8] "FST"     

$CAF
[1] "CFD"   "DCN"   "GSN"   "EBF1"  "PRKG1"

$CD4.T.cells
[1] "CD4"  "CD3E" "CD3D" "SELL" "CCR7" "IL7R" "TCF7" "LEF1"

$CD8.T.cells
[1] "CD8B" "CD8A" "CD3E" "CD3D"

Now we loop through each cell type within this list, and run Seurat’s AddModuleScore() to add a signature score for each cell type to the object.

# Add module scores to the Seurat object based on marker genes
for (cell_type in names(marker_list)) {
  object <- AddModuleScore(
    object = object,
    features = list(marker_list[[cell_type]]),
    name = paste0(cell_type, "_score")
  )
}

# AddModuleScore appends '1' to the end of each signature
# Here we just remove the '1' appended to the column names
colnames(object@meta.data) <- gsub(
  "_score1$", "_score",
  colnames(object@meta.data)
)

# Grep out the names of the signature score columns for later use
score_names <- grep(
  pattern = "_score",
  x = colnames(object@meta.data),
  value = T
)

Now we can see we’ve added these signature scores to our object’s metadata slot

object@meta.data[1:5, score_names[1:5]]
                      Adipocytes_score B.cell_score Basal_Myoepithelial_score
s_008um_00594_00250-1      -0.04843953  -0.05159478               -0.08651620
s_008um_00549_00222-1      -0.04485982  -0.05245652               -0.12322605
s_008um_00543_00215-1      -0.05486628  -0.03945521               -0.13809411
s_008um_00623_00165-1      -0.02630046  -0.01333380               -0.08457084
s_008um_00622_00168-1      -0.02800082  -0.06598156               -0.11589853
                        CAF_score CD4.T.cells_score
s_008um_00594_00250-1 -0.06500176       -0.07717423
s_008um_00549_00222-1 -0.11179067       -0.04862343
s_008um_00543_00215-1 -0.13263827       -0.08021617
s_008um_00623_00165-1 -0.10047016       -0.03276049
s_008um_00622_00168-1 -0.12828592       -0.06733268

Now we ask, for each cell, which cell type signature scores highest. If this maximum score is above a minimum threshold, we assign the cell to that type

# Identify the cell type for each cell based on the highest module score
object$cell_type <- apply(
  X = object@meta.data[, grep("_score$", colnames(object@meta.data))],
  MARGIN = 1,
  FUN = function(scores) {
    # If the maximum score is <0.05 we don't assign a celltype
    if (max(scores) < 0.05) {
      return("Other")
    } else {
      return(names(scores)[which.max(scores)])
    }
  }
)

# Clean up the '_score' suffix from the cell_type column
object@meta.data[["cell_type"]] <- gsub("_score", "", object@meta.data[["cell_type"]])

# Convert to a factor, ordered by our cell type color ordering
object$cell_type <- factor(object$cell_type, levels = names(celltype_cols))
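
Before plotting, a quick sanity check (not part of the original workflow) is to count how many bins were assigned to each cell type, including the unassigned "Other" category:

table(object$cell_type)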

Now we can plot these cell type assignments

Idents(object) <- "cell_type"

p1 <- SpatialDimPlot(object,
  group.by = "cell_type",
  pt.size.factor = pt_size_factor,
  label = F,
  label.size = 3,
  image.alpha = 0.2,
  alpha = c(1, 1)
) +
  scale_fill_manual(values = celltype_cols) +
  ggtitle("Major Cell Types") +
  theme(legend.key.size = unit(0.1, "cm")) +
  guides(fill = guide_legend(override.aes = list(size = 3)))

p2 <- DimPlot(object,
  reduction = "umap.banksy",
  label = F,
  pt.size = 0.05,
  raster = F # For large datasets set to T
) +
  scale_color_manual(values = celltype_cols) +
  ggtitle("Major Cell Types") +
  theme(
    legend.key.size = unit(0.2, "cm"),
    aspect.ratio = 1
  )

final_plot <- wrap_plots(p1, p2, ncol = 2)
final_plot

We can look at the proportion of cell types within each Banksy cluster.

object@meta.data %>%
  ggplot(aes(x = banksy_cluster)) +
  geom_bar(aes(fill = cell_type), position = "fill") +
  scale_fill_manual(values = celltype_cols) +
  theme_classic() +
  guides(fill = guide_legend(keywidth = 0.5, keyheight = 0.5)) +
  labs(y = "Proportion", fill = "Cell Type")

Note

Remember these are cell neighborhood clusters, and as such contain a mixture of different cell types
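
As a quick numeric companion to the bar plot above, the sketch below (assuming dplyr is loaded) reports the single most common cell type within each Banksy cluster:

object@meta.data %>%
  count(banksy_cluster, cell_type, name = "n_bins") %>%
  group_by(banksy_cluster) %>%
  slice_max(n_bins, n = 1) %>%
  ungroup()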

Let’s plot the raw signature values over each assigned cell type.

celltypes_to_plot <- names(celltype_cols)
celltypes_to_plot <- celltypes_to_plot[celltypes_to_plot != "Other"]
celltype_score_vlnplots <- VlnPlot(
  object,
  features = paste0(celltypes_to_plot, "_score"),
  pt.size = 0,
  group.by = "cell_type",
  cols = celltype_cols,
  sort = T,
  combine = F # This will return a list instead of a patchwork
)

names(celltype_score_vlnplots) <- celltypes_to_plot

celltype_score_umapplots <- FeaturePlot(
  object = object,
  features = paste0(celltypes_to_plot, "_score"),
  reduction = "umap.banksy",
  pt.size = 0.05,
  combine = F,
  # This will compress the color scale to bring out cells
  min.cutoff = "q1",
  max.cutoff = "q99",
  # Bring the points with higher values on top
  order = T,
  raster = F # For large datasets set to T
)
names(celltype_score_umapplots) <- celltypes_to_plot

Now we can loop through and plot the signatures as violins and atop the Banksy UMAP

for (ct in celltypes_to_plot) {
  p1 <- celltype_score_umapplots[[ct]] +
    scale_color_viridis_c()

  p2 <- celltype_score_vlnplots[[ct]] +
    NoLegend()

  pcomb <- free(p1, type = "label", side = "b") +
    p2 + plot_layout(nrow = 1, widths = c(1, 2.5))
  print(pcomb)
}

We can also use lapply() to loop through each cell type and plot it on the image

celltype_plots <- lapply(X = celltypes_to_plot, FUN = function(ct) {
  # Subset object for the current cell type
  subsetobject <- object[, object$cell_type == ct]

  # Generate the SpatialDimPlot
  p <- SpatialDimPlot(
    subsetobject,
    group.by = "cell_type",
    pt.size.factor = pt_size_factor + 5, # Slightly higher
    label = FALSE,
    image.alpha = 0.2,
    alpha = c(1, 0.2),
    cols = celltype_cols
  ) +
    NoLegend() +
    ggtitle(ct) +
    theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12))
  return(p)
})

names(celltype_plots) <- celltypes_to_plot

Now we have a list of cell type specific plots

celltype_plots$Fibroblasts + celltype_plots$Luminal.mature

We can wrap them all in a single patchwork:

# Combine all plots into a grid
final_plot <- wrap_plots(celltype_plots, ncol = 5)

# Display the final combined plot
final_plot

Advanced Analysis

Pseudotime

Now we will subset for only epithelial cells and run pseudotime analysis using Monocle3

Subset for epithelial cells

Subset the data to the cell types of interest

# Subset for epithelial cells only
object_epi <- subset(
  object,
  idents = c(
    "Luminal.progenitor",
    "Luminal.mature",
    "Tumor",
    "Basal_Myoepithelial"
  )
)

DefaultAssay(object_epi) <- "Spatial.008um"

Because we’ve subset the data, we’ll re-run FindVariableFeatures(), as the features that are variable across only epithelial cells may differ from those in the entire dataset

object_epi <- FindVariableFeatures(object_epi)
object_epi
An object of class Seurat 
40170 features across 10352 samples within 3 assays 
Active assay: Spatial.008um (18085 features, 2000 variable features)
 3 layers present: counts, data, scale.data
 2 other assays present: BANKSY, sketch
 6 dimensional reductions calculated: pca.banksy, umap.banksy, pca.sketch, umap.sketch, full.pca.sketch, full.umap.sketch
 1 spatial field of view present: slice1.008um

Rerun the Banksy workflow on this subset

# Run Banksy
object_epi <- RunBanksy(object_epi,
  lambda = 0.8,
  verbose = TRUE,
  assay = "Spatial.008um",
  slot = "data",
  features = "variable",
  k_geom = 24
)

object_epi <- RunPCA(object_epi,
  assay = "BANKSY",
  reduction.name = "pca.banksy",
  features = rownames(object_epi),
  npcs = 20
)

ElbowPlot(object_epi, reduction = "pca.banksy")

object_epi <- FindNeighbors(object_epi,
  reduction = "pca.banksy",
  dims = 1:15
)

object_epi <- FindClusters(object_epi,
  cluster.name = "banksy_cluster",
  resolution = 0.15
)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 10352
Number of edges: 249083

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9291
Number of communities: 9
Elapsed time: 0 seconds
object_epi <- RunUMAP(object_epi,
  reduction = "pca.banksy",
  reduction.name = "umap.banksy",
  return.model = T,
  dims = 1:15
)

Now we can see what this looks like

p2 <- SpatialDimPlot(object_epi,
  group.by = "banksy_cluster",
  pt.size.factor = pt_size_factor,
  label = F,
  label.size = 5,
  image.alpha = 0.2,
  alpha = c(1, 1)
) +
  scale_fill_manual(values = distinct_palette) +
  ggtitle("Banksy Neighborhood Clustering") +
  theme(legend.key.size = unit(0.5, "cm")) +
  guides(fill = guide_legend(override.aes = list(size = 5)))

p3 <- DimPlot(object_epi,
  group.by = "banksy_cluster",
  reduction = "umap.banksy",
  label = TRUE,
  label.size = 2,
  raster = FALSE
) +
  scale_color_manual(values = distinct_palette) +
  ggtitle("Banksy Neighborhood Clustering") +
  theme(legend.key.size = unit(0.5, "cm")) +
  guides(fill = guide_legend(override.aes = list(size = 5)))

final_plot <- wrap_plots(p2, p3, ncol = 2)
final_plot

Monocle3 Analysis

Now we run the monocle3 pseudotime analysis tool. First we create the monocle3 specific cell_data_set object. See the documentation for more information.

library(monocle3)

# Create Monocle CDS
expression_matrix <- object_epi@assays[["Spatial.008um"]]@layers[["counts"]]
colnames(expression_matrix) <- colnames(object_epi@assays[["Spatial.008um"]])
rownames(expression_matrix) <- rownames(object_epi@assays[["Spatial.008um"]])

metadata <- object@meta.data[colnames(expression_matrix), ]

# Monocle3 complains otherwise
metadata$sample_name <- NULL

gene_annotation <- data.frame(gene_short_name = rownames(expression_matrix))
rownames(gene_annotation) <- rownames(expression_matrix)

cds <- new_cell_data_set(expression_matrix,
  cell_metadata = metadata,
  gene_metadata = gene_annotation
)
cds
class: cell_data_set 
dim: 18085 10352 
metadata(1): cds_version
assays(1): counts
rownames(18085): SAMD11 NOC2L ... MT-ND6 MT-CYB
rowData names(1): gene_short_name
colnames(10352): s_008um_00594_00250-1 s_008um_00549_00222-1 ...
  s_008um_00603_00164-1 s_008um_00548_00195-1
colData names(31): orig.ident nCount_Spatial.008um ... cell_type
  Size_Factor
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

Now we run pre-processing, dimensionality reduction, and cell clustering, and learn the principal graph within the monocle3 framework

cds <- preprocess_cds(cds, num_dim = 10)

cds <- reduce_dimension(cds)

cds <- cluster_cells(cds,
  reduction_method = "UMAP",
  max_clusters = 6
)

cds <- learn_graph(cds, use_partition = F)

Let’s see what the UMAP looks like with the cell types overlaid

plot_cells(cds,
  show_trajectory_graph = FALSE,
  label_groups_by_cluster = FALSE,
  label_branch_points = FALSE,
  label_principal_points = FALSE,
  color_cells_by = "cell_type"
) +
  scale_color_manual(values = epi_type_cols) +
  theme(legend.position = "right")

As per monocle3’s documentation: It’s often desirable to specify the root of the trajectory programmatically, rather than manually picking it. The function below does so by first grouping the cells according to which trajectory graph node they are nearest to. Then, it calculates what fraction of the cells at each node come from the earliest time point. Then it picks the node that is most heavily occupied by early cells and returns that as the root. We have no time points in this dataset, so instead we treat the Luminal.mature and Luminal.progenitor bins as the ‘earliest’ cells.

# a helper function to identify the root principal points:
get_earliest_principal_node <- function(
    cds,
    time_bin = c("Luminal.mature", "Luminal.progenitor")) {
  # Cells belonging to the 'earliest' populations (here, the luminal cell types)
  cell_ids <- which(colData(cds)[, "cell_type"] %in% time_bin)

  closest_vertex <-
    cds@principal_graph_aux[["UMAP"]]$pr_graph_cell_proj_closest_vertex
  closest_vertex <- as.matrix(closest_vertex[colnames(cds), ])

  # Graph node most frequently closest to those cells
  root_pr_nodes <-
    igraph::V(principal_graph(cds)[["UMAP"]])$name[
      as.numeric(names(which.max(table(closest_vertex[cell_ids, ]))))
    ]

  root_pr_nodes
}
cds <- order_cells(cds, root_pr_nodes = get_earliest_principal_node(cds))

Now let’s plot the cells with pseudotime overlaid. We can create a nice patchworked plot

p0 <- plot_cells(cds,
  show_trajectory_graph = FALSE,
  label_groups_by_cluster = FALSE,
  label_branch_points = FALSE,
  label_principal_points = FALSE,
  color_cells_by = "cell_type"
) +
  theme(legend.position = "right") +
  scale_color_manual(values = epi_type_cols) +
  labs(title = "Cell Type", color = NULL)

p1 <- plot_cells(cds,
  color_cells_by = "pseudotime",
  label_cell_groups = FALSE,
  label_leaves = FALSE,
  label_branch_points = FALSE,
  graph_label_size = 1.5
) +
  labs(title = "Pseudotime")

p2 <- plot_cells(cds,
  color_cells_by = "cluster",
  label_groups_by_cluster = F, # Ensures clusters are labeled
  label_leaves = FALSE,
  label_branch_points = FALSE,
  group_label_size = 4
) +
  scale_color_manual(values = distinct_palette) +
  theme(legend.position = "right") +
  labs(title = "Monocle3 Cluster")

p3 <- plot_cells(cds,
  color_cells_by = "banksy_cluster",
  label_groups_by_cluster = FALSE,
  label_leaves = FALSE,
  label_branch_points = FALSE,
  group_label_size = 4
) +
  scale_color_manual(values = distinct_palette) +
  theme(legend.position = "right") +
  labs(title = "Banksy Cluster")

pfinal <- p0 + p1 + p2 + p3 + plot_layout(ncol = 2)
pfinal

Split by cell type with pseudotime overlaid

plot_cells(cds,
  color_cells_by = "pseudotime",
  label_groups_by_cluster = F,
  label_leaves = F,
  label_branch_points = F
) +
  facet_wrap(~cell_type, nrow = 2) +
  scale_color_gradientn(
    colors = c("blue3", "palegreen3", "yellow", "tomato", "firebrick")
  ) +
  labs(color = "Pseudotime")

Split by banksy clusters

plot_cells(cds,
  color_cells_by = "pseudotime",
  label_groups_by_cluster = F,
  label_leaves = F,
  label_branch_points = F
) +
  facet_wrap(~banksy_cluster, ncol = 4) +
  scale_color_gradientn(
    colors = c("blue3", "palegreen3", "yellow", "tomato", "firebrick")
  ) +
  labs(color = "Pseudotime")

Add this back to our original object

clusters <- cds@clusters@listData[["UMAP"]][["clusters"]]
pseudotime <- cds@principal_graph_aux@listData[["UMAP"]][["pseudotime"]]

object_epi <- AddMetaData(object_epi,
  metadata = clusters,
  col.name = "monocle3_cluster"
)
object_epi <- AddMetaData(object_epi,
  metadata = pseudotime,
  col.name = "monocle3_pseudotime"
)

Deal with infinite pseudotime values

Monocle3 assigns an infinite pseudotime to any bin that cannot be reached from the chosen root along the principal graph. Before plotting, we cap these values at the maximum finite pseudotime.

## Plot
DefaultAssay(object_epi) <- "Spatial.008um"
Idents(object_epi) <- "monocle3_cluster"

summary(object_epi$monocle3_pseudotime)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    9.50   23.99   20.91   31.20   37.85 
max_finite <- max(
  object_epi$monocle3_pseudotime[is.finite(object_epi$monocle3_pseudotime)],
  na.rm = TRUE
)

object_epi$monocle3_pseudotime[is.infinite(object_epi$monocle3_pseudotime)] <- max_finite
FeaturePlot(object_epi, features = "monocle3_pseudotime") +
  scale_color_gradientn(
    colors = c("blue3", "palegreen3", "yellow", "tomato", "firebrick"),
    name = "monocle3_pseudotime"
  ) +
  ggtitle("Monocle Pseudotime") +
  theme(
    legend.key.size = unit(0.4, "cm")
  )

# Generate the SpatialDimPlot
SpatialFeaturePlot(
  object_epi,
  features = "monocle3_pseudotime",
  pt.size.factor = pt_size_factor,
  image.alpha = 0.2,
  alpha = c(1, 1)
) +
  scale_fill_gradientn(colors = c("blue3", "palegreen3", "yellow", "tomato", "firebrick")) +
  ggtitle("Monocle Pseudotime") +
  theme(
    legend.position = "right",
    legend.key.size = unit(0.5, "cm")
  ) +
  labs(fill = "pseudotime")

DefaultAssay(object_epi) <- "Spatial.008um"
Idents(object_epi) <- "monocle3_cluster"

p2 <- SpatialDimPlot(object_epi,
  group.by = "monocle3_cluster",
  pt.size.factor = pt_size_factor,
  label = F,
  label.size = 5,
  image.alpha = 0.2,
  alpha = c(1, 1)
) +
  scale_fill_manual(values = distinct_palette) +
  ggtitle("Monocle3 Clustering") +
  theme(legend.key.size = unit(0.5, "cm")) +
  guides(fill = guide_legend(override.aes = list(size = 5)))


p3 <- DimPlot(object_epi,
  group.by = "monocle3_cluster",
  reduction = "umap.banksy",
  label = TRUE,
  label.size = 2,
  raster = FALSE
) +
  scale_color_manual(values = distinct_palette) +
  ggtitle("Monocle3 Clustering") +
  theme(legend.key.size = unit(0.5, "cm")) +
  guides(fill = guide_legend(override.aes = list(size = 5)))

final_plot <- wrap_plots(p2, p3, ncol = 2)
final_plot

Let’s see how pseudotime breaks down by these different variables. We use fct_reorder() from the forcats package to reorder our x-axis variables by the median pseudotime

library(forcats)

object_epi@meta.data %>%
  ggplot(aes(x = fct_reorder(cell_type, monocle3_pseudotime, median), y = monocle3_pseudotime)) +
  geom_violin(aes(fill = cell_type), scale = "width", show.legend = F) +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = epi_type_cols) +
  labs(x = "Cell Type", y = "Pseudotime") +
  guides(x = guide_axis(angle = 45)) +
  theme_classic()

object_epi@meta.data %>%
  ggplot(aes(x = fct_reorder(banksy_cluster, monocle3_pseudotime, median), y = monocle3_pseudotime)) +
  geom_violin(aes(fill = banksy_cluster), scale = "width", show.legend = F) +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = distinct_palette) +
  labs(x = "Banksy Cluster", y = "Pseudotime") +
  theme_classic()

object_epi@meta.data %>%
  ggplot(aes(x = fct_reorder(monocle3_cluster, monocle3_pseudotime, median), y = monocle3_pseudotime)) +
  geom_violin(aes(fill = monocle3_cluster), scale = "width", show.legend = F) +
  geom_boxplot(width = 0.1) +
  scale_fill_manual(values = distinct_palette) +
  labs(x = "Monocle3 Cluster", y = "Pseudotime") +
  theme_classic()

Let’s export CSV files of the cell types and clusters so they can be imported into Loupe Browser

outdir <- "data/"
sample_name <- "Visium_DCIS"

metadata <- object@meta.data[, c("seurat_cluster.projected", "banksy_cluster", "cell_type")] %>%
  tibble::rownames_to_column(var = "CellID")

export_cluster_proj <- metadata[, c("CellID", "seurat_cluster.projected")]
export_cluster_banksy <- metadata[, c("CellID", "banksy_cluster")]
export_celltype <- metadata[, c("CellID", "cell_type")]

# Write to CSV
write.csv(export_cluster_proj, file = file.path(outdir, paste0(sample_name, "_cluster_proj.csv")), row.names = FALSE)
write.csv(export_cluster_banksy, file = file.path(outdir, paste0(sample_name, "_cluster_banksy.csv")), row.names = FALSE)
write.csv(export_celltype, file = file.path(outdir, paste0(sample_name, "_cell_type.csv")), row.names = FALSE)