###R code for the differential gene expression analysis

library(tidyverse)
library(Biobase)
library(limma)
library(DESeq2)
library(edgeR)
library(Seurat)


#limma
####main parameters:
#eBayes: Given a linear model fit from lmFit, compute moderated t-statistics, moderated F-statistic, and log-odds of differential expression by empirical Bayes moderation of the standard errors towards a global value.
#coef: Column number or column name specifying which coefficient or contrast of the linear model is of interest.
#number: Maximum number of genes to list
#Accession: dataset ID, GPL: platform ID
#exprSet: Microarray expression data 
####
exprSet = read_delim(paste0(readdir,Accession,'_',GPL,".txt"),delim = "\t")
#Building a grouping matrix
class<-factor(c(rep("control",length(control)),rep("case",length(case))))
design=model.matrix(~0+class)
dimnames(design) = list(c(control,case),c("case","control"))
#differential analysis
contrast.matrix <- makeContrasts('case-control',levels = design)
fit=lmFit(log2_exprSet,design) %>% contrasts.fit(., contrast.matrix) %>% eBayes(.)
allDiff_result=topTable(fit,coef=1,number=Inf)

#DESeq2
####main parameters:
#minReplicatesForReplace: The minimum number of replicates required in order to use replaceOutliers on a sample. If there are samples with so many replicates, the model will be refit after these replacing outliers.
#exprSet: The expression profile file that has undergone standard upstream analysis processes.
####
exprSet <- exprSet [rowSums(exprSet )>0,] #Filtering low-expressed genes
#Building a grouping matrix
condition<-factor(c(rep("control",length(control)),rep("case",length(case))),levels=c("control","case"))
colData <- data.frame(row.names = colnames(exprSet ),condition)
#differential analysis
dds <- DESeqDataSetFromMatrix(exprSet , colData, design = ~condition)
dds <- DESeq(dds,minReplicatesForReplace = 40)
allDiff_result <- results(dds)

#edgeR
####main parameters:
#bcv: Biological coefficient of variance,used to estimate dispersion
#dispersion: Used to compute genewise exact tests for differences in the means between two groups of negative-binomially distributed counts.
#calcNormFactors: Calculate scaling factors to convert the raw library sizes for a set of sequenced samples into normalized effective library sizes.
####
#Building a grouping matrix
class<-factor(c(rep("control",length(control)),rep("case",length(case))))
groups<-factor(c(rep("control",length(control)),rep("case",length(case))),levels=c("control","case"))
#differential analysis
y <- DGEList(counts = exprSet , group = groups)
keep <- rowSums(cpm(y)>1) >= 1  	
y <- y[keep, ,keep.lib.sizes = FALSE]	#Filtering low-expressed genes
y <- calcNormFactors(y)
#Set bcv to 0.4 for humans and 0.1 for mice
if(species == 'homo_sapiens'){bcv = 0.4}else{bcv = 0.1}
et <- exactTest(y, dispersion = bcv ^ 2)
allDiff<- as.data.frame(et$table)
allDiff$p.adjust <- p.adjustallDiff$PValue, method="fdr")
allDiff_result <- allDiff[,c('logFC','PValue','p.adjust')]

#scrna analysis
####main parameters:
FindMarkers: Finds markers (differentially expressed genes) for identity classes
#ident.1: case cluster, ident.2: control cluster
#group.by: Regroup cells into a different identity class prior to performing differential expression
####
#diff_df <- FindMarkers(object,ident.1 = case, ident.2 = control,group.by = 'group')
### main workflow ###
library(Seurat)
library(harmony)
library(parallel)
library(dplyr)
library(reshape2)
library(data.table)
library(HGNChelper)
library(readxl)
library(reshape2)
library(stringr)
library(tibble)
library(readr)
library(clusterProfiler)
library(org.Hs.eg.db)
library(org.Mm.eg.db)
library(ggplot2)
library(enrichplot)
library(tidyverse)
options(bitmapType='cairo')
db_ = "D:/scRNAseq/RNAtherapy/chaosuanxiazai/file/ScTypeDB_full_old.xlsx"  #sctypeDB path
## SCTransform + harmony 
# Cell filter function
filter <- function(sce){
  sce[["percent.mt"]] <- PercentageFeatureSet(sce, pattern = "^M(?i:T)-")
  sce[["percent.HB"]] <- PercentageFeatureSet(sce, pattern = "H(?i:B)")   ### (?i:)Indicates that the character is not case sensitive
  ## Set the threshold standard, mainly nFeature_RNA, set the 90% threshold
  maxnF <- max(sce@meta.data$nFeature_RNA)
  nF_cutoff <- round(0.9*maxnF)
  sce <- subset(sce, subset = nFeature_RNA > 200 & nFeature_RNA < nF_cutoff & percent.mt < 10)
}
# There are two clustering functions: one is that a GSE has multiple samples that need to be integrated and analyzed (cluster_multi_SRR), and the other is that a GSE has only one sample and does not need to be integrated and analyzed (cluster_one_SRR)
cluster_multi_SRR <- function(sce_list,GSE){
  
  ye <- sce_list[[2]]
  if(length(sce_list)>2){
    for(k in 3:length(sce_list)){
      ye <- append(ye,sce_list[[k]])
    }
  }else{
    ye <- ye
  }
  scRNA <- merge(sce_list[[1]],y = ye,project = GSE)
  ## SCT normalizes data, replacing NormalizeData, FindVariableFeatures, and ScaleData functions
  print('SCTransform start!')
  scRNA <- SCTransform(scRNA,vars.to.regress = "percent.mt",verbose = FALSE)
  scRNA <- RunPCA(scRNA)
  scRNA <- RunHarmony(scRNA,group.by.vars = "orig.ident",assay.use = "SCT",max.iter.harmony = 10)
  # group.by.vars is used to set the group to be integrated
  # max.iter.harmony, Set the number of iterations, the default is 10. The result of running RunHarmony will indicate the number of iterations after which convergence is achieved.
  # There is a lambda parameter in the RunHarmony function. The default value is 1, which determines the strength of Harmony integration. The smaller the lambda value, the greater the integration strength, and vice versa. (Only this parameter affects the integration strength, and the adjustment range is generally between 0.5-2)
  
  ### Best PC
  # Determine percent of variation associated with each PC
  pct <- scRNA [["pca"]]@stdev / sum( scRNA [["pca"]]@stdev) * 100
  # Calculate cumulative percents for each PC
  cumu <- cumsum(pct)
  # Determine which PC exhibits cumulative percent greater than 90% and % variation associated with the PC as less than 5
  # The cumulative contribution of the principal components is greater than 90%.
  #PC itself contributes less than 5% to the variance
  co1 <- which(cumu > 90 & pct < 5)[1]
  # Determine the difference between variation of PC and subsequent PC. The difference between two consecutive PCs is less than 0.1%
  co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
  # last point where change of % of variation is more than 0.1%.
  # Minimum of the two calculation
  pcs <- min(co1, co2)
  pc.num <- 1:pcs
  res <-  c(0.2,0.4,0.6,0.8,1.0)
  # If the data contains duplicates, RunTSNE will fail. The solution is to add a check_duplicates = FALSE line
  scRNA <- RunTSNE(scRNA, reduction="harmony",check_duplicates = FALSE,dims=pc.num) %>% RunUMAP(reduction="harmony", dims=pc.num) %>%
    FindNeighbors(reduction="harmony", dims=pc.num) %>% FindClusters(resolution=res)
  return(scRNA)
}
cluster_one_SRR <- function(sce_list,GSE){
  scRNA <- SCTransform(sce_list[[1]],vars.to.regress = "percent.mt",verbose = FALSE)
  scRNA <- RunPCA(scRNA)
  pct <- scRNA [["pca"]]@stdev / sum( scRNA [["pca"]]@stdev) * 100
  cumu <- cumsum(pct)
  co1 <- which(cumu > 90 & pct < 5)[1]
  co2 <- sort(which((pct[1:length(pct) - 1] - pct[2:length(pct)]) > 0.1), decreasing = T)[1] + 1
  pcs <- min(co1, co2)
  pc.num <- 1:pcs
  res <-  c(0.2,0.4,0.6,0.8,1.0)
  scRNA <- RunTSNE(scRNA, reduction="pca",dims=pc.num) %>% RunUMAP(reduction="pca", dims=pc.num) %>%
    FindNeighbors(reduction="pca", dims=pc.num) %>% FindClusters(resolution=res)
  return(scRNA)
}

### Marker gene output
marker <- function(scRNA,rslt,studyid){
  org <- unique(scRNA$organism)
  res <- c(0.2,0.4,0.6,0.8,1.0)
  for(clu in c(0.2,0.4,0.6,0.8,1.0)){
    Idents(scRNA) <- paste0("SCT_snn_res.",clu)
    cluster_marker <- FindAllMarkers(scRNA, assay = 'SCT',
                                     slot = 'data', only.pos = TRUE, verbose = TRUE)
    write.csv(cluster_marker,file = paste0(rslt,'/cluster/',studyid,"_SCT_snn_res.",clu,"_marker.csv"),row.names = F)
  }
}
### Output by celltype
marker_ct <- function(scRNA,rslt,studyid){
  org <- unique(scRNA$organism)
  res <- c(0.2,0.4,0.6,0.8,1.0)
  for(clu in c(0.2,0.4,0.6,0.8,1.0)){
    Idents(scRNA) <- paste0("SCT_snn_res.",clu,"_celltype")
    celltype_marker <- FindAllMarkers(scRNA, assay = 'SCT',
                                      slot = 'data', only.pos = TRUE, verbose = TRUE)
    write.csv(celltype_marker,file = paste0(rslt,'/celltype/',studyid,"_SCT_snn_res.",clu,"_marker.csv"),row.names = F)
  }
}
#### Different groups (if any) were analyzed for differences
diff <- function(scRNA,rslt,res,control,case,studyid){ 
  future::plan('multisession',workers=8) 
  scRNA_bak <- scRNA
  Idents(scRNA_bak) <- res
  all_diff <- data.frame()
  for(clu in 0:(length(table(Idents(scRNA_bak)))-1)){
    sub_object <- subset(scRNA_bak,idents = clu)
    Idents(sub_object) <- 'group'
    tryCatch({
      diff_df <- FindMarkers(sub_object,ident.1 = case, ident.2 = control,group.by = 'group')
      diff_df$clusters <- clu
      diff_df$resolution <- res
      diff_df = diff_df %>% rownames_to_column('gene')
      all_diff <- rbind(all_diff,diff_df)
    },error=function(e){
      sink(paste0(errorpath,'diff_errlog.txt'),append = T)
      print(paste0(studyid,'_',res,'_',clu))
      sink()
    })
  }
  future::plan('multisession',workers=1) 
  write.csv(all_diff,paste0(rslt,'/cluster/',studyid,'_',res,'_diff.csv'),row.names = F)
}
#### Different groups (if any) were analyzed by cell type
diff_ct <- function(scRNA,rslt,res,control,case,studyid){ 
  future::plan('multisession',workers=8) 
  scRNA_bak <- scRNA
  Idents(scRNA_bak) <- res
  all_diff <- data.frame()
  for(clu in names(table(Idents(scRNA_bak)))){
    sub_object <- subset(scRNA_bak,idents = clu)
    Idents(sub_object) <- 'group'
    tryCatch({
      diff_df <- FindMarkers(sub_object,ident.1 = case, ident.2 = control,group.by = 'group')
      diff_df$clusters <- clu
      diff_df$resolution <- res
      diff_df = diff_df %>% rownames_to_column('gene')
      all_diff <- rbind(all_diff,diff_df)
    },error=function(e){
      sink(paste0(errorpath,'diff_errlog.txt'),append = T)
      print(paste0(studyid,'_',res,'_celltype',clu))
      sink()
    })
  future::plan('multisession',workers=1)
  }
  write.csv(all_diff,paste0(rslt,'/celltype/',studyid,'_',res,'_diff.csv'),row.names = F)
}


### Different groups, overall differential expression analysis
diff_whole <- function(scRNA,rslt,control,case,studyid){
  future::plan('multisession',workers=8)
  scRNA_bak <- scRNA
  Idents(scRNA_bak) <- 'group'
  whole_diff <- FindMarkers(scRNA_bak,ident.1 = case, ident.2 = control,group.by = 'group',logfc.threshold = -1, min.pct = 0)
  whole_diff = whole_diff %>% rownames_to_column('gene')
  future::plan('multisession',workers=1) 
  write.csv(whole_diff,paste0(rslt,'/',studyid,'_whole_diff.csv'),row.names = F)
}


organism_orgDb = data.frame(organism = c("Mus musculus","Homo sapiens"),
                            OrgDbs = c('org.Mm.eg.db','org.Hs.eg.db'))

output_files = function(rslt,studyid){
  allDiff = read_delim(paste0(rslt,'/',studyid,'_whole_diff.csv'))
  studyinfo <- dataset[dataset$UniID == studyid,]
  organism = studyinfo$Organism
  orgDb = organism_orgDb[match(organism,organism_orgDb$organism),2]
  dir.create(paste0(rslt,'/common_analysis'))
  dir.create(paste0(rslt,'/common_analysis/pics'))
  file_trans = bitr(unlist(allDiff[,1]),"SYMBOL","ENTREZID",orgDb)
  colnames(allDiff)[1]=colnames(file_trans)[1]
  allDiff=inner_join(file_trans,allDiff,by='SYMBOL')
  allDiff=allDiff[-1]
  write.table(allDiff,paste0(rslt,'/',studyid,'_whole_diff.csv'),row.names = F,sep = "\t",quote = F)
  mostDiff = allDiff %>% dplyr::filter(p_val_adj < 0.05 & abs(avg_log2FC) > 0) %>% arrange(desc(abs(avg_log2FC))) #%>% head(2000)
  # the most up
  upmostdiff = mostDiff %>% dplyr::filter(avg_log2FC > 0)
  write.table(upmostdiff,paste0(rslt,"/common_analysis/",studyid,"_upmostdiff.txt"),row.names = F,sep = "\t",quote = F)
  # the most down
  downmostdiff = mostDiff %>% dplyr::filter(avg_log2FC < 0)
  write.table(downmostdiff,paste0(rslt,"/common_analysis/",studyid,"_downmostdiff.txt"),row.names = F,sep = "\t",quote = F)
}

scanalysis <- function(platform,expdir,studyid,outdir,dataset){
  if(!dir.exists(outdir))(dir.create(outdir))
  if(!dir.exists(paste0(outdir,'/',platform)))(dir.create(paste0(outdir,'/',platform)))
  if(!dir.exists(paste0(outdir,'/',platform,'/result')))(dir.create(paste0(outdir,'/',platform,'/result')))
  if(!dir.exists(paste0(outdir,'/',platform,'/result/',studyid)))(dir.create(paste0(outdir,'/',platform,'/result/',studyid)))
  rslt <- paste0(outdir,'/',platform,'/result/',studyid)
  
    studyinfo <- dataset[dataset$UniID == studyid,]
  Con <- unlist(str_split(studyinfo$Control,pattern = ';'))
  Case <- unlist(str_split(studyinfo$Case,pattern = ';'))
  GSM_id <- append(Con, Case)
  GSE <- studyinfo$Accession
  if(platform == '10x'){
    sce_list <- list()
    for(GSM in GSM_id){
      if(GSM %in% Con){
        group <- 'control'
      }else{
        group <- 'case'
      }
      if(file.exists(paste0(outdir,'/',platform,'/Rds/',GSM,'.Rds'))){
        organism <- studyinfo$Organism
        tissue <- studyinfo$tissue_type
        sce_list[[GSM]] <- readRDS(paste0(outdir,'/',platform,'/Rds/',GSM,'.Rds'))
        # Add a prefix to the cell barcode to prevent duplication of barcode names after merging
        sce_list[[GSM]] <- RenameCells(sce_list[[GSM]], add.cell.id = GSM)
        sce_list[[GSM]]$orig.ident <- GSM
        sce_list[[GSM]]$organism <- organism
        sce_list[[GSM]]$tissue <- tissue
        sce_list[[GSM]]$group <- group
      }else{
        sink(paste0(errorpath,'read_fail.txt'),append = T)
        print(paste0(GSE,".Rds read failed"))
        sink()
      }
    }
    sce_list <- lapply(sce_list,filter)
    print('cluster_multi_SRR!')
    print(sce_list)
    scRNA <- cluster_multi_SRR(sce_list,GSE)
  }
  ### Add corresponding celltype to scRNA object
  control <- 'control'
  case <- 'case'
  tissue <- unique(scRNA$tissue)
  sctype = openxlsx::read.xlsx(db_)
  if(tissue %in% unique(sctype$tissueType)){
    celltypeinfo <- read_csv(paste0(rslt,'/',studyid,'_tsne_2D.csv')) %>% as.data.frame() %>% .[,c(11:15)] 
    for(c in colnames(celltypeinfo)){
      scRNA <- AddMetaData(scRNA,metadata = celltypeinfo[,c],col.name = c)
    }
    saveRDS(scRNA,file = paste0(rslt,'/',studyid,'.Rds'))
    marker_ct(scRNA,rslt,studyid)
    for(res in c('SCT_snn_res.0.2_celltype','SCT_snn_res.0.4_celltype','SCT_snn_res.0.6_celltype','SCT_snn_res.0.8_celltype','SCT_snn_res.1_celltype')){
      switch (res,
              'SCT_snn_res.0.2_celltype' = diff_ct(scRNA,rslt,res,control,case,studyid),
              'SCT_snn_res.0.4_celltype' = diff_ct(scRNA,rslt,res,control,case,studyid),
              'SCT_snn_res.0.6_celltype' = diff_ct(scRNA,rslt,res,control,case,studyid),
              'SCT_snn_res.0.8_celltype' = diff_ct(scRNA,rslt,res,control,case,studyid),
              'SCT_snn_res.1_celltype' = diff_ct(scRNA,rslt,res,control,case,studyid)
      )
    }
  }else{
    saveRDS(scRNA,file = paste0(rslt,'/',studyid,'.Rds'))
  }
  marker(scRNA,rslt,studyid)
  for(res in c('SCT_snn_res.0.2','SCT_snn_res.0.4','SCT_snn_res.0.6','SCT_snn_res.0.8','SCT_snn_res.1')){
    switch (res,
            'SCT_snn_res.0.2' = diff(scRNA,rslt,res,control,case,studyid),
            'SCT_snn_res.0.4' = diff(scRNA,rslt,res,control,case,studyid),
            'SCT_snn_res.0.6' = diff(scRNA,rslt,res,control,case,studyid),
            'SCT_snn_res.0.8' = diff(scRNA,rslt,res,control,case,studyid),
            'SCT_snn_res.1' = diff(scRNA,rslt,res,control,case,studyid)
    )
  }
  diff_whole(scRNA,rslt,control,case,studyid)
  rm(scRNA,scRNA_bak,sub_object,sce_list);gc()
  output_files(rslt,studyid)
  print('output_files over!')
  print(paste0(studyid,': Analysis has been completed!'))
}