ifnb.list <- SplitObject(ifnb, split.by = "stim")
# normalize and identify variable features for each dataset independently
ifnb.list <- lapply(X = ifnb.list, FUN = function(x) {
x <- NormalizeData(x)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
# select features that are repeatedly variable across datasets for integration
features <- SelectIntegrationFeatures(object.list = ifnb.list)
immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, anchor.features = features)
# this command creates an 'integrated' data assay
immune.combined <- IntegrateData(anchorset = immune.anchors)
sceList = lapply(folders,function(folder){
CreateSeuratObject(counts = Read10X(folder),
project = folder )
})
sce.big <- merge(sceList[[1]],
y = c(sceList[[2]],sceList[[3]],sceList[[4]],sceList[[5]],
sceList[[6]],sceList[[7]],sceList[[8]]),
add.cell.ids = folders,
project = "mouse8")
sce.big <- NormalizeData(sce.big)
sce.big <- FindVariableFeatures(sce.big, selection.method = "vst", nfeatures = 2000)
####有的人会在scale的时候矫正一点样本的变异来源
sce.big <- ScaleData(sce.big, verbose = FALSE,vars.to.regress = "sample") ###样本信息列
,
ScaleData()和
FindVariableFeatures()`,但是内部的算法原理还是不同,这里简单给大家介绍一下,对算法不感兴趣的略过,但是要记住,SCTransform不是功能上替换了上述三个函数,但是计算上有不同。其中where xi is the vector of UMI counts assigned to gene i and m is the vector of molecules assigned to the cells, i.e.,
一般来说,对于线性回归我们有多种方式求解参数,比如最小二乘法、极大似然法等。极大似然法要求我们提出一个残差项的分布模型。这里残差项应该指的就是公式1中等号右侧的值减去等号左侧的值。此残差分布目前生物学上较多的使用了负二项分布的模型。
确定好模型后我们就能对每个基因都评估一组参数(β0 and the slope β1 and θ ):the intercept β0 and the slope β1. The dispersion parameter θ of the underlying NB distribution is also unknown and needs to be estimated from the data.
2. 对于每个基因推断的参数,文章第二步使用了核回归的方式去让表达量相近的基因所对应的参数保持一致,这里主要是为了避免第一步用每个基因推断的参数所具有的不确定性较大的问题。We capture these trends using a kernel regression estimate (ksmooth function in R). We use a normal kernel and first select a kernel bandwidth using the R function bw.SJ. We multiply this by a bandwidth adjustment factor (BAF, default value of 3)。这一步之后所得到的参数被称为regularized regression parameters。
3. 获得regularized regression parameters之后,第三步就是将参数和cell总UMI数重新带入公式1,去计算预期的基因表达量,与实际的基因表达量相减获得残差项,将残差项除以负二项分布推断的标准差后就是pearson residuals,也就是最后的标准化结果。
ifnb.list <- SplitObject(ifnb, split.by = "stim")
ifnb.list <- lapply(X = ifnb.list, FUN = SCTransform) ###每个样本SCTransform处理
features <- SelectIntegrationFeatures(object.list = ifnb.list, nfeatures = 3000)
ifnb.list <- PrepSCTIntegration(object.list = ifnb.list, anchor.features = features)
immune.anchors <- FindIntegrationAnchors(object.list = ifnb.list, normalization.method = "SCT",
anchor.features = features)
immune.combined.sct <- IntegrateData(anchorset = immune.anchors, normalization.method = "SCT")
sceList = lapply(folders,function(folder){
CreateSeuratObject(counts = Read10X(folder),
project = folder )
})
sce.big <- merge(sceList[[1]],
y = c(sceList[[2]],sceList[[3]],sceList[[4]],sceList[[5]],
sceList[[6]],sceList[[7]],sceList[[8]]),
add.cell.ids = folders,
project = "mouse8")
####矫正一点样本的批次来源
sce.big <- SCTransform(sce.big, vars.to.regress = "sample", verbose = FALSE)
sceList = lapply(folders,function(folder){
CreateSeuratObject(counts = Read10X(folder),
project = folder )
})
sce.big <- merge(sceList[[1]],
y = c(sceList[[2]],sceList[[3]],sceList[[4]],sceList[[5]],
sceList[[6]],sceList[[7]],sceList[[8]]),
add.cell.ids = folders,
project = "mouse8")
sce.big <- NormalizeData(sce.big)
sce.big <- FindVariableFeatures(sce.big, selection.method = "vst", nfeatures = 2000) %>% ScaleData(verbose = FALSE) %>% RunPCA(pc.gene = sce.big@var.genes,npcs = 20,verbose = FALSE)
###Scale可以选择所有基因还是高变基因
sce.big <- RunHarmony(sce.big, group.by.vars = "sample")
我发现还些人CCA和harmony一起用,这是错误的,我就不展示代码了。
SCT && harmony:
sceList = lapply(folders,function(folder){
CreateSeuratObject(counts = Read10X(folder),
project = folder )
})
sce.big <- merge(sceList[[1]],
y = c(sceList[[2]],sceList[[3]],sceList[[4]],sceList[[5]],
sceList[[6]],sceList[[7]],sceList[[8]]),
add.cell.ids = folders,
project = "mouse8")
####矫正一点样本的批次来源
sce.big <- SCTransform(sce.big, vars.to.regress = "sample", verbose = FALSE)
DefaultAssay(sce.big) = 'SCT'
sce.big <- ScaleData(sce.big,verbose = FALSE) %>% RunPCA(pc.gene = sce.big@var.genes,npcs = 20,verbose = FALSE) %>% RunHarmony(group.by.vars = "sample")
过矫正的问题