【中药网络药理学】筛选细胞衰老和预后相关基因(附分类代码和画图代码)

avatar
作者
猴君
阅读量:0

具体代码和详细的中药网络药理学代码以及讲解可以看我的github(欢迎star)doge

https://github.com/qianwei1129/TCM-Network-Pharmacology

1、衰老相关基因

HAGRmsigdb数据获取细胞衰老相关基因,将两者取交集后构建基因蛋白互作网络

HAGR数据库

该库本身提供了下载链接,我在下载后对其进行了清洗

msigdb数据库

以"aging"作为关键词,Search Filters中collection设置为"all collections",source species设置为"Homo sapiens",contributors设置为"all contributions"

在msigdb数据库中共得到56个衰老相关基因集

2、小细胞肝癌预后相关基因_TCGA数据库

library("survival") library("survminer") 

使用TCGA数据库通过TCGA-LIHC数据集(2023年1月,n=377),下载clinical(临床信息),点击Metadata(样本信息),cart(基因文件)[/data/TCGA数据库]获取转录组测序数据与临床数据,并排除了临床数据不完整的患者队列

我使用了Harmony软件包进行批次效应调整,

接下来,基于拟合多因素Cox回归和lasso回归,构建预后效果评价分类模型

以"stranded_first"作为特定基因的表达水平评价量,临床信息中选择vital_status:生存状态(例如,存活、死亡)和days_to_death:诊断后至死亡的天数,作为评价的临床指标,最后整合得到如下表格:

病例ID基因名生存时间结局
ID1GeneA时间1结果1
ID2GeneB时间2结果2
ID3GeneC时间3结果3

描述性分析

首先使用生存分析中的Cox比例风险模型建立一个风险评分系统,然后利用风险评分中位数对其进行分组,将其分为“高风险组”和”低风险组“,并绘制生存分析图(针对筛选出的基因)

# 生存分析 load("files/exp_clinical")  install.packages("survival")  install.packages("survminer") install.packages("glmnet") install.packages("caret")  library(survival) library(survminer) library(glmnet) library(dplyr) library(caret)  # 绘制Kaplan-Meier曲线 exp_clinical <- as.data.frame(t(exp_clinical)) exp_clinical$time <- as.numeric(exp_clinical$time) exp_clinical <- exp_clinical[!is.na(exp_clinical$time), ]  # 观察小范围内数据框 View(exp_clinical[(1:50), (1:50)]) View(gene_data[(1:50), (1:50)]) View(normalized_data[(1:50), (1:50)])  gene_data <- exp_clinical[(1:(nrow(exp_clinical)-5)), ] clinical_data <- exp_clinical[((nrow(exp_clinical)-4) : nrow(exp_clinical)), ]   # 选择至少在一个样本中表达量高于该基因90%分位数的基因 sapply(gene_data, class)  gene_data[is.na(gene_data)] <- 0 gene_data[] <- lapply(gene_data, function(x) as.numeric(as.character(x)))  gene_data_scale <- as.data.frame(scale(gene_data))  normalize_minmax <- function(x) {   return((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))) }  gene_data <- as.data.frame(lapply(gene_data_scale, normalize_minmax))  names(gene_data) <- names(clinical_data) data <- rbind(clinical_data, gene_data) data <- as.data.frame(t(data))  rownames(data)[6:nrow(data)] <- rownames(exp_clinical)[1:(nrow(exp_clinical)-5)] save(data, file = "files/gene_clinical_data")   # 做相关性分析 load("files/gene_clinical_data")  View(data[(1:50), (1:50)])  
rm(list = ls())  load("files/exp_clinical") load("files/agging_HCC_tcm_gene")  library(ggplot2) library(ggpubr) library(survival) library(survminer) library(glmnet) library(mice) library(dplyr) library(car) library(randomForestSRC) library(caret)   ## 预处理 rownames(exp_clinical) <- data_rownames exp_clinical <- replace(exp_clinical, is.na(exp_clinical), 0) temp <- as.data.frame(t(exp_clinical))  temp <- data.frame(lapply(temp, as.numeric)) temp <- replace(temp, is.na(temp), 0)  colnames(temp) <- data_rownames colnames(temp)  gene_data <- as.data.frame(temp[, unlist(aging_HCC_tcm_gene)]) temp_gene_data <- as.data.frame(lapply(gene_data, as.numeric))  str(temp_gene_data)  temp_gene_data <- scale(temp_gene_data) gene_data <- as.data.frame(temp_gene_data) # temp <- scale(gene_data)  rownames(gene_data) <- t(colnames(exp_clinical))  exp_clinical <- as.data.frame(t(exp_clinical)) clinical_data <- as.data.frame(cbind(exp_clinical$time, exp_clinical$flag))  colnames(clinical_data) <- c("time", "flag")  clinical_data <- as.data.frame(lapply(clinical_data, as.numeric)) clinical_data[is.na(clinical_data)] <- 0  sum(clinical_data$flag)  rownames(clinical_data) <- rownames(gene_data)  rows_to_remove <- rownames(clinical_data[clinical_data$time == 0, ])  gene_data <- gene_data[!rownames(gene_data) %in% rows_to_remove, ] clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ] # clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ]  # rownames(gene_data) <- t(colnames(exp_clinical))  # clinical_data <- as.matrix(temp[, c("time", "flag")]) # clinical_data <- as.data.frame(clinical_data) #  # clinical_data <- as.data.frame(lapply(clinical_data, as.numeric)) # sum(clinical_data$flag) #  data <- cbind(clinical_data, gene_data) # # temp_data <- cbind(clinical_data, temp) #  # sum(is.na(clinical_data)) # sum(is.na(gene_data))  # gene_data <- preProcess(gene_data, method = "range")  save(data, gene_data, clinical_data, file = "files/surv_pre_data")   ## 风险回归 rm(list = ls()) load("files/surv_pre_data")  str(clinical_data) str(gene_data)  custom_control <- coxph.control(eps = 1e-09,                                  iter.max = 10000,                                  toler.chol = 1e-10)  surv_model <- coxph(Surv(time, flag) ~ .,                      data = data,                      control = custom_control)  # temp_surv_model <- coxph( #  Surv(time, flag) ~ .,  #  data = temp_data,  #  control = coxph.control(iter.max = 10000) #)  surv_model$coefficients  # surv_model <- temp_surv_model  coefficients <- as.data.frame(surv_model$coefficients) coefficients[is.na(coefficients)] <- 0 coefficients  gene_data <- data.frame(gene_data) gene_coef_data <- rbind(gene_data, t(coefficients)) rownames(gene_coef_data)[nrow(gene_coef_data)] <- c("coef")  surv_results <- sweep(gene_data, 2, t(coefficients), `*`) surv_row_sums <- rowSums(surv_results)  surv_row_sums[is.na(surv_row_sums)] <- 0 surv_row_sums  surv_median_value <- median(surv_row_sums) surv_median_value   # 分类并添加分组 risk_group <- ifelse(surv_row_sums > surv_median_value,                      "High Risk",                      "Low Risk") surv_results$risk_group <- risk_group final_surv_results <- surv_results[, !apply(surv_results,                                        2,                                       function(x) all(x == 0))]  save(data,       risk_group,      gene_data,       clinical_data,       coefficients,       final_surv_results,       file = "files/surv_final_data")  # 绘制生存曲线 rm(list = ls())  load("files/surv_final_data")  colnames(final_surv_results)   temp_data <- data temp_data$time <- ceiling(temp_data$time/30) # temp_data <- lapply(temp_data, as.numeric)  # 剔除大于5年的 rows_to_remove <- rownames(temp_data[temp_data$time > 60, ]) temp_data <- temp_data[!rownames(temp_data) %in% rows_to_remove, ] clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ] risk_group <- as.data.frame(risk_group) risk_group <- risk_group[!rownames(risk_group) %in% rows_to_remove, ]   temp_surv_obj <- Surv(temp_data$time, temp_data$flag) temp_data <- cbind(temp_data, risk_group) temp_surv_fit <- survfit(temp_surv_obj ~ risk_group,                      data = temp_data) temp_max <- max(temp_data$time) ggsurvplot(temp_surv_fit,             data = temp_data,            risk.table = TRUE, # 显示风险表            pval = TRUE, # 显示P值            conf.int = TRUE, # 显示置信区间            xlim = c(0, temp_max), # 可选:设置X轴的范围            xlab = "Times(months)", # 设置X轴标签            ylab = "Survival probability", # 设置Y轴标签 )  coefficients$gene_name <- rownames(coefficients) colnames(coefficients)[1] <- c("coef") writexl::write_xlsx(coefficients, "data/washed_data/coef.xlsx")  surv_obj <- Surv(data$time, data$flag) surv_fit <- survfit(surv_obj ~ risk_group,                      data = data) max <- max(data$time) ggsurvplot(surv_fit,             data = final_surv_results,            risk.table = TRUE, # 显示风险表            pval = TRUE, # 显示P值            conf.int = TRUE, # 显示置信区间            xlim = c(0, max), # 可选:设置X轴的范围            xlab = "Times(days)", # 设置X轴标签            ylab = "Survival probability", # 设置Y轴标签            title = "Kaplan-Meier 生存曲线" # 设置图标题 ) 

    广告一刻

    为您即时展示最新活动产品广告消息,让您随时掌握产品活动新动态!