阅读量:0
具体代码和详细的中药网络药理学代码以及讲解可以看我的github(欢迎star)doge
https://github.com/qianwei1129/TCM-Network-Pharmacology
1、衰老相关基因
从HAGR和msigdb数据获取细胞衰老相关基因,将两者取交集后构建基因蛋白互作网络
HAGR数据库
该库本身提供了下载链接,我在下载后对其进行了清洗
msigdb数据库
以"aging"作为关键词,Search Filters中collection设置为"all collections",source species设置为"Homo sapiens",contributors设置为"all contributions"
在msigdb数据库中共得到56个衰老相关基因集
2、小细胞肝癌预后相关基因_TCGA数据库
library("survival") library("survminer")
使用TCGA数据库通过TCGA-LIHC数据集(2023年1月,n=377),下载clinical(临床信息),点击Metadata(样本信息),cart(基因文件)[/data/TCGA数据库]获取转录组测序数据与临床数据,并排除了临床数据不完整的患者队列
我使用了Harmony软件包进行批次效应调整,
接下来,基于拟合多因素Cox回归和lasso回归,构建预后效果评价分类模型
以"stranded_first"作为特定基因的表达水平评价量,临床信息中选择vital_status:生存状态(例如,存活、死亡)和days_to_death:诊断后至死亡的天数,作为评价的临床指标,最后整合得到如下表格:
病例ID | 基因名 | 生存时间 | 结局 |
---|---|---|---|
ID1 | GeneA | 时间1 | 结果1 |
ID2 | GeneB | 时间2 | 结果2 |
ID3 | GeneC | 时间3 | 结果3 |
描述性分析
首先使用生存分析中的Cox比例风险模型建立一个风险评分系统,然后利用风险评分中位数对其进行分组,将其分为“高风险组”和”低风险组“,并绘制生存分析图(针对筛选出的基因)
# 生存分析 load("files/exp_clinical") install.packages("survival") install.packages("survminer") install.packages("glmnet") install.packages("caret") library(survival) library(survminer) library(glmnet) library(dplyr) library(caret) # 绘制Kaplan-Meier曲线 exp_clinical <- as.data.frame(t(exp_clinical)) exp_clinical$time <- as.numeric(exp_clinical$time) exp_clinical <- exp_clinical[!is.na(exp_clinical$time), ] # 观察小范围内数据框 View(exp_clinical[(1:50), (1:50)]) View(gene_data[(1:50), (1:50)]) View(normalized_data[(1:50), (1:50)]) gene_data <- exp_clinical[(1:(nrow(exp_clinical)-5)), ] clinical_data <- exp_clinical[((nrow(exp_clinical)-4) : nrow(exp_clinical)), ] # 选择至少在一个样本中表达量高于该基因90%分位数的基因 sapply(gene_data, class) gene_data[is.na(gene_data)] <- 0 gene_data[] <- lapply(gene_data, function(x) as.numeric(as.character(x))) gene_data_scale <- as.data.frame(scale(gene_data)) normalize_minmax <- function(x) { return((x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))) } gene_data <- as.data.frame(lapply(gene_data_scale, normalize_minmax)) names(gene_data) <- names(clinical_data) data <- rbind(clinical_data, gene_data) data <- as.data.frame(t(data)) rownames(data)[6:nrow(data)] <- rownames(exp_clinical)[1:(nrow(exp_clinical)-5)] save(data, file = "files/gene_clinical_data") # 做相关性分析 load("files/gene_clinical_data") View(data[(1:50), (1:50)])
rm(list = ls()) load("files/exp_clinical") load("files/agging_HCC_tcm_gene") library(ggplot2) library(ggpubr) library(survival) library(survminer) library(glmnet) library(mice) library(dplyr) library(car) library(randomForestSRC) library(caret) ## 预处理 rownames(exp_clinical) <- data_rownames exp_clinical <- replace(exp_clinical, is.na(exp_clinical), 0) temp <- as.data.frame(t(exp_clinical)) temp <- data.frame(lapply(temp, as.numeric)) temp <- replace(temp, is.na(temp), 0) colnames(temp) <- data_rownames colnames(temp) gene_data <- as.data.frame(temp[, unlist(aging_HCC_tcm_gene)]) temp_gene_data <- as.data.frame(lapply(gene_data, as.numeric)) str(temp_gene_data) temp_gene_data <- scale(temp_gene_data) gene_data <- as.data.frame(temp_gene_data) # temp <- scale(gene_data) rownames(gene_data) <- t(colnames(exp_clinical)) exp_clinical <- as.data.frame(t(exp_clinical)) clinical_data <- as.data.frame(cbind(exp_clinical$time, exp_clinical$flag)) colnames(clinical_data) <- c("time", "flag") clinical_data <- as.data.frame(lapply(clinical_data, as.numeric)) clinical_data[is.na(clinical_data)] <- 0 sum(clinical_data$flag) rownames(clinical_data) <- rownames(gene_data) rows_to_remove <- rownames(clinical_data[clinical_data$time == 0, ]) gene_data <- gene_data[!rownames(gene_data) %in% rows_to_remove, ] clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ] # clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ] # rownames(gene_data) <- t(colnames(exp_clinical)) # clinical_data <- as.matrix(temp[, c("time", "flag")]) # clinical_data <- as.data.frame(clinical_data) # # clinical_data <- as.data.frame(lapply(clinical_data, as.numeric)) # sum(clinical_data$flag) # data <- cbind(clinical_data, gene_data) # # temp_data <- cbind(clinical_data, temp) # # sum(is.na(clinical_data)) # sum(is.na(gene_data)) # gene_data <- preProcess(gene_data, method = "range") save(data, gene_data, clinical_data, file = "files/surv_pre_data") ## 风险回归 rm(list = ls()) load("files/surv_pre_data") str(clinical_data) str(gene_data) custom_control <- coxph.control(eps = 1e-09, iter.max = 10000, toler.chol = 1e-10) surv_model <- coxph(Surv(time, flag) ~ ., data = data, control = custom_control) # temp_surv_model <- coxph( # Surv(time, flag) ~ ., # data = temp_data, # control = coxph.control(iter.max = 10000) #) surv_model$coefficients # surv_model <- temp_surv_model coefficients <- as.data.frame(surv_model$coefficients) coefficients[is.na(coefficients)] <- 0 coefficients gene_data <- data.frame(gene_data) gene_coef_data <- rbind(gene_data, t(coefficients)) rownames(gene_coef_data)[nrow(gene_coef_data)] <- c("coef") surv_results <- sweep(gene_data, 2, t(coefficients), `*`) surv_row_sums <- rowSums(surv_results) surv_row_sums[is.na(surv_row_sums)] <- 0 surv_row_sums surv_median_value <- median(surv_row_sums) surv_median_value # 分类并添加分组 risk_group <- ifelse(surv_row_sums > surv_median_value, "High Risk", "Low Risk") surv_results$risk_group <- risk_group final_surv_results <- surv_results[, !apply(surv_results, 2, function(x) all(x == 0))] save(data, risk_group, gene_data, clinical_data, coefficients, final_surv_results, file = "files/surv_final_data") # 绘制生存曲线 rm(list = ls()) load("files/surv_final_data") colnames(final_surv_results) temp_data <- data temp_data$time <- ceiling(temp_data$time/30) # temp_data <- lapply(temp_data, as.numeric) # 剔除大于5年的 rows_to_remove <- rownames(temp_data[temp_data$time > 60, ]) temp_data <- temp_data[!rownames(temp_data) %in% rows_to_remove, ] clinical_data <- clinical_data[!rownames(clinical_data) %in% rows_to_remove, ] risk_group <- as.data.frame(risk_group) risk_group <- risk_group[!rownames(risk_group) %in% rows_to_remove, ] temp_surv_obj <- Surv(temp_data$time, temp_data$flag) temp_data <- cbind(temp_data, risk_group) temp_surv_fit <- survfit(temp_surv_obj ~ risk_group, data = temp_data) temp_max <- max(temp_data$time) ggsurvplot(temp_surv_fit, data = temp_data, risk.table = TRUE, # 显示风险表 pval = TRUE, # 显示P值 conf.int = TRUE, # 显示置信区间 xlim = c(0, temp_max), # 可选:设置X轴的范围 xlab = "Times(months)", # 设置X轴标签 ylab = "Survival probability", # 设置Y轴标签 ) coefficients$gene_name <- rownames(coefficients) colnames(coefficients)[1] <- c("coef") writexl::write_xlsx(coefficients, "data/washed_data/coef.xlsx") surv_obj <- Surv(data$time, data$flag) surv_fit <- survfit(surv_obj ~ risk_group, data = data) max <- max(data$time) ggsurvplot(surv_fit, data = final_surv_results, risk.table = TRUE, # 显示风险表 pval = TRUE, # 显示P值 conf.int = TRUE, # 显示置信区间 xlim = c(0, max), # 可选:设置X轴的范围 xlab = "Times(days)", # 设置X轴标签 ylab = "Survival probability", # 设置Y轴标签 title = "Kaplan-Meier 生存曲线" # 设置图标题 )