###R code for the differential gene expression analysis library(tidyverse) library(Biobase) library(limma) library(DESeq2) library(edgeR) library(Seurat) #limma ####main parameters: #eBayes: Given a linear model fit from lmFit, compute moderated t-statistics, moderated F-statistic, and log-odds of differential expression by empirical Bayes moderation of the standard errors towards a global value. #coef: Column number or column name specifying which coefficient or contrast of the linear model is of interest. #number: Maximum number of genes to list #Accession: dataset ID, GPL: platform ID #exprSet: Microarray expression data #### exprSet = read_delim(paste0(readdir,Accession,'_',GPL,".txt"),delim = "\t") #Building a grouping matrix class<-factor(c(rep("control",length(control)),rep("case",length(case)))) design=model.matrix(~0+class) dimnames(design) = list(c(control,case),c("case","control")) #differential analysis contrast.matrix <- makeContrasts('case-control',levels = design) fit=lmFit(log2_exprSet,design) %>% contrasts.fit(., contrast.matrix) %>% eBayes(.) allDiff_result=topTable(fit,coef=1,number=Inf) #DESeq2 ####main parameters: #minReplicatesForReplace: The minimum number of replicates required in order to use replaceOutliers on a sample. If there are samples with so many replicates, the model will be refit after these replacing outliers. #exprSet: The expression profile file that has undergone standard upstream analysis processes. #### exprSet <- exprSet [rowSums(exprSet )>0,] #Filtering low-expressed genes #Building a grouping matrix condition<-factor(c(rep("control",length(control)),rep("case",length(case))),levels=c("control","case")) colData <- data.frame(row.names = colnames(exprSet ),condition) #differential analysis dds <- DESeqDataSetFromMatrix(exprSet , colData, design = ~condition) dds <- DESeq(dds,minReplicatesForReplace = 40) allDiff_result <- results(dds) #edgeR ####main parameters: #bcv: Biological coefficient of variance,used to estimate dispersion #dispersion: Used to compute genewise exact tests for differences in the means between two groups of negative-binomially distributed counts. #calcNormFactors: Calculate scaling factors to convert the raw library sizes for a set of sequenced samples into normalized effective library sizes. #### #Building a grouping matrix class<-factor(c(rep("control",length(control)),rep("case",length(case)))) groups<-factor(c(rep("control",length(control)),rep("case",length(case))),levels=c("control","case")) #differential analysis y <- DGEList(counts = exprSet , group = groups) keep <- rowSums(cpm(y)>1) >= 1 y <- y[keep, ,keep.lib.sizes = FALSE] #Filtering low-expressed genes y <- calcNormFactors(y) #Set bcv to 0.4 for humans and 0.1 for mice if(species == 'homo_sapiens'){bcv = 0.4}else{bcv = 0.1} et <- exactTest(y, dispersion = bcv ^ 2) allDiff<- as.data.frame(et$table) allDiff$p.adjust <- p.adjustallDiff$PValue, method="fdr") allDiff_result <- allDiff[,c('logFC','PValue','p.adjust')] #scrna analysis ####main parameters: FindMarkers: Finds markers (differentially expressed genes) for identity classes #ident.1: case cluster, ident.2: control cluster #group.by: Regroup cells into a different identity class prior to performing differential expression #### #diff_df <- FindMarkers(object,ident.1 = case, ident.2 = control,group.by = 'group') ### main workflow ### library(Seurat) library(harmony) library(parallel) library(dplyr) library(reshape2) library(data.table) library(HGNChelper) library(readxl) library(reshape2) library(stringr) library(tibble) library(readr) library(clusterProfiler) library(org.Hs.eg.db) library(org.Mm.eg.db) library(ggplot2) library(enrichplot) library(tidyverse) options(bitmapType='cairo') db_ = "D:/scRNAseq/RNAtherapy/chaosuanxiazai/file/ScTypeDB_full_old.xlsx" #sctypeDB path ## SCTransform + harmony # Cell filter function filter <- function(sce){ sce[["percent.mt"]] <- PercentageFeatureSet(sce, pattern = "^M(?i:T)-") sce[["percent.HB"]] <- PercentageFeatureSet(sce, pattern = "H(?i:B)") ### (?i:)Indicates that the character is not case sensitive ## Set the threshold standard, mainly nFeature_RNA, set the 90% threshold maxnF <- max(sce@meta.data$nFeature_RNA) nF_cutoff <- round(0.9*maxnF) sce <- subset(sce, subset = nFeature_RNA > 200 & nFeature_RNA < nF_cutoff & percent.mt < 10) } # There are two clustering functions: one is that a GSE has multiple samples that need to be integrated and analyzed (cluster_multi_SRR), and the other is that a GSE has only one sample and does not need to be integrated and analyzed (cluster_one_SRR) cluster_multi_SRR <- function(sce_list,GSE){ ye <- sce_list[[2]] if(length(sce_list)>2){ for(k in 3:length(sce_list)){ ye <- append(ye,sce_list[[k]]) } }else{ ye <- ye } scRNA <- merge(sce_list[[1]],y = ye,project = GSE) ## SCT normalizes data, replacing NormalizeData, FindVariableFeatures, and ScaleData functions print('SCTransform start!') scRNA <- SCTransform(scRNA,vars.to.regress = "percent.mt",verbose = FALSE) scRNA <- RunPCA(scRNA) scRNA <- RunHarmony(scRNA,group.by.vars = "orig.ident",assay.use = "SCT",max.iter.harmony = 10) # group.by.vars is used to set the group to be integrated # max.iter.harmony, Set the number of iterations, the default is 10. The result of running RunHarmony will indicate the number of iterations after which convergence is achieved. # There is a lambda parameter in the RunHarmony function. The default value is 1, which determines the strength of Harmony integration. The smaller the lambda value, the greater the integration strength, and vice versa. (Only this parameter affects the integration strength, and the adjustment range is generally between 0.5-2) ### Best PC # Determine percent of variation associated with each PC pct <- scRNA [["pca"]]@stdev / sum( scRNA [["pca"]]@stdev) * 100 # Calculate cumulative percents for each PC cumu <- cumsum(pct) # Determine which PC exhibits cumulative percent greater than 90% and % variation associated with the PC as less than 5 # The cumulative contribution of the principal components is greater than 90%. #PC itself contributes less than 5% to the variance co1 <- which(cumu > 90 & pct < 5)[1] # Determine the difference between variation of PC and subsequent PC. The difference between two consecutive PCs is less than 0.1% co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1 # last point where change of % of variation is more than 0.1%. # Minimum of the two calculation pcs <- min(co1, co2) pc.num <- 1:pcs res <- c(0.2,0.4,0.6,0.8,1.0) # If the data contains duplicates, RunTSNE will fail. The solution is to add a check_duplicates = FALSE line scRNA <- RunTSNE(scRNA, reduction="harmony",check_duplicates = FALSE,dims=pc.num) %>% RunUMAP(reduction="harmony", dims=pc.num) %>% FindNeighbors(reduction="harmony", dims=pc.num) %>% FindClusters(resolution=res) return(scRNA) } cluster_one_SRR <- function(sce_list,GSE){ scRNA <- SCTransform(sce_list[[1]],vars.to.regress = "percent.mt",verbose = FALSE) scRNA <- RunPCA(scRNA) pct <- scRNA [["pca"]]@stdev / sum( scRNA [["pca"]]@stdev) * 100 cumu <- cumsum(pct) co1 <- which(cumu > 90 & pct < 5)[1] co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1 pcs <- min(co1, co2) pc.num <- 1:pcs res <- c(0.2,0.4,0.6,0.8,1.0) scRNA <- RunTSNE(scRNA, reduction="pca",dims=pc.num) %>% RunUMAP(reduction="pca", dims=pc.num) %>% FindNeighbors(reduction="pca", dims=pc.num) %>% FindClusters(resolution=res) return(scRNA) } ### Marker gene output marker <- function(scRNA,rslt,studyid){ org <- unique(scRNA$organism) res <- c(0.2,0.4,0.6,0.8,1.0) for(clu in c(0.2,0.4,0.6,0.8,1.0)){ Idents(scRNA) <- paste0("SCT_snn_res.",clu) cluster_marker <- FindAllMarkers(scRNA, assay = 'SCT', slot = 'data', only.pos = TRUE, verbose = TRUE) write.csv(cluster_marker,file = paste0(rslt,'/cluster/',studyid,"_SCT_snn_res.",clu,"_marker.csv"),row.names = F) } } ### Output by celltype marker_ct <- function(scRNA,rslt,studyid){ org <- unique(scRNA$organism) res <- c(0.2,0.4,0.6,0.8,1.0) for(clu in c(0.2,0.4,0.6,0.8,1.0)){ Idents(scRNA) <- paste0("SCT_snn_res.",clu,"_celltype") celltype_marker <- FindAllMarkers(scRNA, assay = 'SCT', slot = 'data', only.pos = TRUE, verbose = TRUE) write.csv(celltype_marker,file = paste0(rslt,'/celltype/',studyid,"_SCT_snn_res.",clu,"_marker.csv"),row.names = F) } } #### Different groups (if any) were analyzed for differences diff <- function(scRNA,rslt,res,control,case,studyid){ future::plan('multisession',workers=8) scRNA_bak <- scRNA Idents(scRNA_bak) <- res all_diff <- data.frame() for(clu in 0:(length(table(Idents(scRNA_bak)))-1)){ sub_object <- subset(scRNA_bak,idents = clu) Idents(sub_object) <- 'group' tryCatch({ diff_df <- FindMarkers(sub_object,ident.1 = case, ident.2 = control,group.by = 'group') diff_df$clusters <- clu diff_df$resolution <- res diff_df = diff_df %>% rownames_to_column('gene') all_diff <- rbind(all_diff,diff_df) },error=function(e){ sink(paste0(errorpath,'diff_errlog.txt'),append = T) print(paste0(studyid,'_',res,'_',clu)) sink() }) } future::plan('multisession',workers=1) write.csv(all_diff,paste0(rslt,'/cluster/',studyid,'_',res,'_diff.csv'),row.names = F) } #### Different groups (if any) were analyzed by cell type diff_ct <- function(scRNA,rslt,res,control,case,studyid){ future::plan('multisession',workers=8) scRNA_bak <- scRNA Idents(scRNA_bak) <- res all_diff <- data.frame() for(clu in names(table(Idents(scRNA_bak)))){ sub_object <- subset(scRNA_bak,idents = clu) Idents(sub_object) <- 'group' tryCatch({ diff_df <- FindMarkers(sub_object,ident.1 = case, ident.2 = control,group.by = 'group') diff_df$clusters <- clu diff_df$resolution <- res diff_df = diff_df %>% rownames_to_column('gene') all_diff <- rbind(all_diff,diff_df) },error=function(e){ sink(paste0(errorpath,'diff_errlog.txt'),append = T) print(paste0(studyid,'_',res,'_celltype',clu)) sink() }) future::plan('multisession',workers=1) } write.csv(all_diff,paste0(rslt,'/celltype/',studyid,'_',res,'_diff.csv'),row.names = F) } ### Different groups, overall differential expression analysis diff_whole <- function(scRNA,rslt,control,case,studyid){ future::plan('multisession',workers=8) scRNA_bak <- scRNA Idents(scRNA_bak) <- 'group' whole_diff <- FindMarkers(scRNA_bak,ident.1 = case, ident.2 = control,group.by = 'group',logfc.threshold = -1, min.pct = 0) whole_diff = whole_diff %>% rownames_to_column('gene') future::plan('multisession',workers=1) write.csv(whole_diff,paste0(rslt,'/',studyid,'_whole_diff.csv'),row.names = F) } organism_orgDb = data.frame(organism = c("Mus musculus","Homo sapiens"), OrgDbs = c('org.Mm.eg.db','org.Hs.eg.db')) output_files = function(rslt,studyid){ allDiff = read_delim(paste0(rslt,'/',studyid,'_whole_diff.csv')) studyinfo <- dataset[dataset$UniID == studyid,] organism = studyinfo$Organism orgDb = organism_orgDb[match(organism,organism_orgDb$organism),2] dir.create(paste0(rslt,'/common_analysis')) dir.create(paste0(rslt,'/common_analysis/pics')) file_trans = bitr(unlist(allDiff[,1]),"SYMBOL","ENTREZID",orgDb) colnames(allDiff)[1]=colnames(file_trans)[1] allDiff=inner_join(file_trans,allDiff,by='SYMBOL') allDiff=allDiff[-1] write.table(allDiff,paste0(rslt,'/',studyid,'_whole_diff.csv'),row.names = F,sep = "\t",quote = F) mostDiff = allDiff %>% dplyr::filter(p_val_adj < 0.05 & abs(avg_log2FC) > 0) %>% arrange(desc(abs(avg_log2FC))) #%>% head(2000) # the most up upmostdiff = mostDiff %>% dplyr::filter(avg_log2FC > 0) write.table(upmostdiff,paste0(rslt,"/common_analysis/",studyid,"_upmostdiff.txt"),row.names = F,sep = "\t",quote = F) # the most down downmostdiff = mostDiff %>% dplyr::filter(avg_log2FC < 0) write.table(downmostdiff,paste0(rslt,"/common_analysis/",studyid,"_downmostdiff.txt"),row.names = F,sep = "\t",quote = F) } scanalysis <- function(platform,expdir,studyid,outdir,dataset){ if(!dir.exists(outdir))(dir.create(outdir)) if(!dir.exists(paste0(outdir,'/',platform)))(dir.create(paste0(outdir,'/',platform))) if(!dir.exists(paste0(outdir,'/',platform,'/result')))(dir.create(paste0(outdir,'/',platform,'/result'))) if(!dir.exists(paste0(outdir,'/',platform,'/result/',studyid)))(dir.create(paste0(outdir,'/',platform,'/result/',studyid))) rslt <- paste0(outdir,'/',platform,'/result/',studyid) studyinfo <- dataset[dataset$UniID == studyid,] Con <- unlist(str_split(studyinfo$Control,pattern = ';')) Case <- unlist(str_split(studyinfo$Case,pattern = ';')) GSM_id <- append(Con, Case) GSE <- studyinfo$Accession if(platform == '10x'){ sce_list <- list() for(GSM in GSM_id){ if(GSM %in% Con){ group <- 'control' }else{ group <- 'case' } if(file.exists(paste0(outdir,'/',platform,'/Rds/',GSM,'.Rds'))){ organism <- studyinfo$Organism tissue <- studyinfo$tissue_type sce_list[[GSM]] <- readRDS(paste0(outdir,'/',platform,'/Rds/',GSM,'.Rds')) # Add a prefix to the cell barcode to prevent duplication of barcode names after merging sce_list[[GSM]] <- RenameCells(sce_list[[GSM]], add.cell.id = GSM) sce_list[[GSM]]$orig.ident <- GSM sce_list[[GSM]]$organism <- organism sce_list[[GSM]]$tissue <- tissue sce_list[[GSM]]$group <- group }else{ sink(paste0(errorpath,'read_fail.txt'),append = T) print(paste0(GSE,".Rds read failed")) sink() } } sce_list <- lapply(sce_list,filter) print('cluster_multi_SRR!') print(sce_list) scRNA <- cluster_multi_SRR(sce_list,GSE) } ### Add corresponding celltype to scRNA object control <- 'control' case <- 'case' tissue <- unique(scRNA$tissue) sctype = openxlsx::read.xlsx(db_) if(tissue %in% unique(sctype$tissueType)){ celltypeinfo <- read_csv(paste0(rslt,'/',studyid,'_tsne_2D.csv')) %>% as.data.frame() %>% .[,c(11:15)] for(c in colnames(celltypeinfo)){ scRNA <- AddMetaData(scRNA,metadata = celltypeinfo[,c],col.name = c) } saveRDS(scRNA,file = paste0(rslt,'/',studyid,'.Rds')) marker_ct(scRNA,rslt,studyid) for(res in c('SCT_snn_res.0.2_celltype','SCT_snn_res.0.4_celltype','SCT_snn_res.0.6_celltype','SCT_snn_res.0.8_celltype','SCT_snn_res.1_celltype')){ switch (res, 'SCT_snn_res.0.2_celltype' = diff_ct(scRNA,rslt,res,control,case,studyid), 'SCT_snn_res.0.4_celltype' = diff_ct(scRNA,rslt,res,control,case,studyid), 'SCT_snn_res.0.6_celltype' = diff_ct(scRNA,rslt,res,control,case,studyid), 'SCT_snn_res.0.8_celltype' = diff_ct(scRNA,rslt,res,control,case,studyid), 'SCT_snn_res.1_celltype' = diff_ct(scRNA,rslt,res,control,case,studyid) ) } }else{ saveRDS(scRNA,file = paste0(rslt,'/',studyid,'.Rds')) } marker(scRNA,rslt,studyid) for(res in c('SCT_snn_res.0.2','SCT_snn_res.0.4','SCT_snn_res.0.6','SCT_snn_res.0.8','SCT_snn_res.1')){ switch (res, 'SCT_snn_res.0.2' = diff(scRNA,rslt,res,control,case,studyid), 'SCT_snn_res.0.4' = diff(scRNA,rslt,res,control,case,studyid), 'SCT_snn_res.0.6' = diff(scRNA,rslt,res,control,case,studyid), 'SCT_snn_res.0.8' = diff(scRNA,rslt,res,control,case,studyid), 'SCT_snn_res.1' = diff(scRNA,rslt,res,control,case,studyid) ) } diff_whole(scRNA,rslt,control,case,studyid) rm(scRNA,scRNA_bak,sub_object,sce_list);gc() output_files(rslt,studyid) print('output_files over!') print(paste0(studyid,': Analysis has been completed!')) }