Bioinformatics

머신러닝을 이용한 DEG 분석

JOFTWARE 2021. 8. 25. 14:28

DEG by Machine Learning

  1. 데이터를 사용하여
  2. 기계학습 모델을 학습시키고
  3. 기계학습 모델에서 중요하게 사용된 특성(feature)에 대해
  4. 특성 중요도(feature importance)를 계산

=> 특성 중요도(feature importance)를 기준으로 DEG 선정

Data

Decision Tree

 

Random Forest Generation

 

 

Random Forest Prediction

 

Feature Importance

 실습 코드

import pandas as pd
data=pd.read_csv("LUAD.txt",delimiter="\t",skiprows=lambda x: x == 1,index_col=0)

lst_labels = []
for col in data.columns:
    # if col = "TCGA-05-4244-01A-01R-1107-07", then sample_type = "01A"
    sample_type = col.split('-')[3] 
    if sample_type.startswith("01"): # primary tumor
        lst_labels.append('cancer')
    elif sample_type.startswith("02"): # reccurent solid tumor
        lst_labels.append('cancer')
    elif sample_type.startswith("10"): # blood derived normal
        lst_labels.append('normal')
    elif sample_type.startswith("11"): # solid tissue normal
        lst_labels.append('normal')
    else:
        lst_labels.append('normal')
        print("Warnning: sample type out of options")
    
import sklearn
# random forest for feature importance on a classification problem
from sklearn.ensemble import RandomForestClassifier
from matplotlib import pyplot
# define the model
model = RandomForestClassifier(n_estimators=2000)
# fit the model
model.fit(data.T, lst_labels)

importance = model.feature_importances_

import seaborn as sns
import matplotlib.pyplot as plt

fi_df = pd.DataFrame({'feature_names':data.index,'feature_importance':importance})

#Sort the DataFrame in order decreasing feature importance
fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

#Define size of bar plot
plt.figure(figsize=(10,8))
#Add chart labels
plt.title('FEATURE IMPORTANCE')
plt.xlabel('FEATURE IMPORTANCE')
plt.ylabel('FEATURE NAMES')
#Plot Searborn bar chart
sns.barplot(x=list(fi_df.head(n=1000)['feature_importance']), y=list(fi_df.head(n=1000)['feature_names']))
plt.show()

import numpy as np
lst_log2fold_change = []
lst_overone_ratio=[]

for i in range(data.shape[0]):
    # load i-th row (i.e, data of gene i)
    line= list(data.iloc[i,:])
    
    # For computing log2 fold change of gene i
    lst_cancer_vals=[]
    lst_normal_vals=[]
    for j in range(len(line)):
        if lst_labels[j] == 'cancer':
            lst_cancer_vals.append(line[j])
        elif lst_labels[j] == 'normal':
            lst_normal_vals.append(line[j])
    log2fold_change = np.log2((np.mean(lst_cancer_vals)+1) / (np.mean(lst_normal_vals)+1))
    lst_log2fold_change.append(log2fold_change)

    # For computing over one ration of gene i
    lst_overone=[]
    for j in range(len(line)):
        if line[j] > 1:
            lst_overone.append(1)
        else:
            lst_overone.append(0)
    overone_ratio = sum(lst_overone)/len(lst_overone)
    lst_overone_ratio.append(overone_ratio)    

lst_isDEG=[]
for i in range(data.shape[0]):
    log2fold_change = lst_log2fold_change[i]
    feature_importance = importance[i]
    if feature_importance > 0.00001 and log2fold_change > 0:
        isDEG = 1
    elif feature_importance > 0.00001 and log2fold_change < 0:
        isDEG = -1
    else:
        isDEG = 0
    lst_isDEG.append(isDEG)
    
res_final = pd.DataFrame({'gene':data.index,'feature_importance':importance,'RFDEG':lst_isDEG})
res_final["RFDEG"].value
_counts()

 

728x90