In [1]:
import sys
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from the .env file located by find_dotenv() into the environment.
load_dotenv(find_dotenv())
PROJECT_DIR = os.getenv("PROJECT_DIR")
# Fail fast with an actionable message: os.getenv returns None when the variable
# is undefined, which would otherwise surface as an opaque "NoneType + str"
# TypeError on the sys.path line below.
if PROJECT_DIR is None:
    raise RuntimeError(
        'PROJECT_DIR is not set; define it in the environment or in a .env file')
# Make the modules under <PROJECT_DIR>/src importable by the imports that follow.
sys.path.append(PROJECT_DIR+'/src')
import paths
from GraphRepresentation5 import GraphRepresentation5
from AptRank import AptRank
from GraphBasedDiffusion import GraphBasedDiffusion
from RandomWalkDiffusion import RandomWalkDiffusion
from prospective import prospective
from IPython import embed
from time import time
from utility import setNetworks, generateDegree1net
import pandas as pd
from collections import defaultdict as ddict
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob as glob
import numpy as np
//anaconda/envs/Diffusion2018/lib/python2.7/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated since IPython 4.0. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

Set up experiments

In [2]:
# Input files used by the prospective run, all rooted at paths.PROJECT_DIR.
# The three names are bound in the same order as the relative paths listed below.
FDAdrugsFile, MeSH_name_file1, MeSH_name_file2 = [
    paths.PROJECT_DIR + relativePath
    for relativePath in (
        '/data/other/full database_12132016_FDAdrugsFormatedmappedFiltered.txt',
        '/data/other/c2016.bin_dict',
        '/data/other/d2016.bin_dict',
    )
]
# Experiment identifier; it is passed to setNetworks and echoed in log messages.
experiment = 'DG'
In [3]:
# Resolve the network settings for this experiment: network names, their
# connection descriptors, and one query/prediction mode per network
# (the mode lists are indexed by network position further down).
networkSettings = setNetworks(
    2016, 'Prospective', experiment,
    paths.NETWORK_MAPPING_FILE, paths.NETWORK_DATA_DIR)
networkNames, conns, selectedQueryModes, selectedPredictionModes = networkSettings
# Echo the resolved configuration, one line per setting.
for message in (
        'Going to load networks: {} for {} experiment'.format(networkNames, experiment),
        'Query modes: {}'.format(selectedQueryModes),
        'Prediction modes: {}'.format(selectedPredictionModes)):
    print(message)
Going to load networks: ['CTD-DG', 'CTD', 'CTD+STRING+MeSH'] for DG experiment
Query modes: ['Disease', 'Disease', 'Disease']
Prediction modes: ['Gene', 'Gene', 'Gene']
In [4]:
# Algorithms under test; algorithmNames[i] is the display label for algorithms[i]
# (the two lists are zipped together by the experiment loop).
algorithms = [GraphBasedDiffusion, RandomWalkDiffusion, AptRank]
algorithmNames = ['GID', 'RW', 'AptRank']

# Trim iteratively the nodes which have degree = 1 in the last input network
# (assuming the last input network is the largest one). The returned
# removedNodes is handed back to generateDegree1net for each network later on;
# _net is not used by the cells shown here.
_net, removedNodes = generateDegree1net(conns[-1], paths.NETWORK_MAPPING_FILE)

# Load a query list of 10 diseases as input. The file is tab-separated with no
# header row; it is parsed once and both columns are read from the same frame.
inputListFile = PROJECT_DIR + '/data/prospective_input_list/top10DieasesFormatedmappedFiltered.txt'
inputTable = pd.read_csv(inputListFile, sep="\t", header=None)
inputList = inputTable[0].values              # column 0: mapped ids
inputListInitialNames = inputTable[4].values  # column 4: initial names
# File name (last path component) up to its first '.'; passed as the run name.
inputPrefix = inputListFile.split('/')[-1].split('.')[0]
# One value per print call: without `from __future__ import print_function`,
# Python 2 parses print(a, b) as printing the tuple (a, b), which emits the
# escaped repr of both strings instead of readable text.
print('Initial names: {}'.format(inputListInitialNames))
print('Mapped id: {}'.format(inputList))
Loading graph took: 0.19804824988 min
Loading graph took: 0.201136136055 min
Saving graph took: 1.58150990804e-06 min
Graph initialization took: 8.79764556885e-06 min
("Initial names: ['Coronary Artery Disease' 'Stroke' 'Pneumonia'\n 'Pulmonary Disease, Chronic Obstructive' 'Diabetes Mellitus'\n 'Lung Neoplasms' 'Colonic Neoplasms' 'Pancreatic Neoplasms'\n 'Breast Neoplasms' 'Liver Neoplasms']", "Mapped id: ['D.2756' 'D.10680' 'D.9003' 'D.9461' 'D.3266' 'D.6563' 'D.2503' 'D.8534'\n 'D.1475' 'D.6506']")

Run experiments for different combinations of algorithms and networks

In [5]:
# Accumulators, reset on every run of this cell so re-running does not append
# duplicates: labels holds one "<algorithm> <network>" string per run, and
# results maps each saved metric name to a list aligned with labels.
labels = []
results = ddict(list)
# Only run the first network for testing
# The Largest network requires 256GB of memory
networkNames = [networkNames[0]]
for algorithm, algorithmName in zip(algorithms, algorithmNames):
    # i also indexes conns and the mode lists, which stay aligned with the
    # (now truncated) networkNames from position 0.
    for i, currName in enumerate(networkNames):
        currQueryMode = selectedQueryModes[i]
        currPredictionMode = selectedPredictionModes[i]
        # print() call form: consistent with the rest of the notebook and valid
        # on both Python 2 (single argument) and Python 3.
        print('Running Prospective for {} using {}'.format(experiment, algorithmName))
        previous = time()
        # Build the network for this run, passing the removedNodes computed
        # earlier from the last input network; the second return value is unused.
        currNet, _removedNodes = generateDegree1net(
            conns[i], paths.NETWORK_MAPPING_FILE, removedNodes)
        later = time()
        print('Graph generation took {} min'.format((later - previous) / 60))
        ###################################################################
        print(currNet)
        print('Generating prospective object for {}'.format(currName))
        validation = prospective(name=inputPrefix, algorithm=algorithm, network=currNet,
                                 selectedEntityList=inputList, selectedEntityInitName=inputListInitialNames,
                                 selectedPredictionMode=currPredictionMode,
                                 selectedQueryMode=currQueryMode,
                                 mappingFile=paths.NETWORK_MAPPING_FILE, FDAdrugsFile=FDAdrugsFile,
                                 MeSH_name_file1=MeSH_name_file1, MeSH_name_file2=MeSH_name_file2,
                                 splitPerformanceByEntity=True)
        validation.perform()
        currentResult = validation.getResults()
        labels.append("%s %s" % (algorithmName, currName))
        # Collect every metric the validation object lists in savedVars.
        for x in (validation.savedVars):
            results[x].append(currentResult[x])
Running Prospective for DG using GID
Loading graph took: 0.0246601661046 min
Loading graph took: 0.0352401336034 min
Saving graph took: 1.2993812561e-06 min
Graph initialization took: 5.9167544047e-06 min
Graph generation took 0.0599335312843 min
Graph object with name: Mapping_Ccgdd16Sx91Extended_101217_CTD2016DG_Degree_1_Retained_Nodes
Total Node Count: 7674
	Chemical Nodes: 0
	Gene Nodes: 5390
	Disease Nodes: 2284
Edges: 22362
	Chemical-Chemical Edges: 0
	Chemical-Gene Edges: 0
	Gene-Gene Edges: 0
	Chemical-Disease Edges: 0
	Disease-Disease Edges: 0
	Gene-Disease Edges: 22362
Generating prospective object for CTD-DG
Took 4.061460495e-05 min to return adjacency matrix
Took 4.38332557678e-05 min to return adjacency matrix
0 known entities are missing.
10 entities were mapped and labeled
Reloading pre-existing saved validation performance metrics
Predictions took 0.0634755531947 min to validate
Running Prospective for DG using RW
Loading graph took: 0.0248582164447 min
Loading graph took: 0.0378374973933 min
Saving graph took: 1.43448511759e-06 min
Graph initialization took: 8.16980997721e-06 min
Graph generation took 0.0627234180768 min
Graph object with name: Mapping_Ccgdd16Sx91Extended_101217_CTD2016DG_Degree_1_Retained_Nodes
Total Node Count: 7674
	Chemical Nodes: 0
	Gene Nodes: 5390
	Disease Nodes: 2284
Edges: 22362
	Chemical-Chemical Edges: 0
	Chemical-Gene Edges: 0
	Gene-Gene Edges: 0
	Chemical-Disease Edges: 0
	Disease-Disease Edges: 0
	Gene-Disease Edges: 22362
Generating prospective object for CTD-DG
Took 4.06503677368e-05 min to return adjacency matrix
Took 4.23987706502e-05 min to return adjacency matrix
0 known entities are missing.
10 entities were mapped and labeled
Reloading pre-existing saved validation performance metrics
Predictions took 0.0649570504824 min to validate
Running Prospective for DG using AptRank
Loading graph took: 0.02486085097 min
Loading graph took: 0.0400495489438 min
Saving graph took: 2.36829121908e-06 min
Graph initialization took: 8.30094019572e-06 min
Graph generation took 0.0649436513583 min
Graph object with name: Mapping_Ccgdd16Sx91Extended_101217_CTD2016DG_Degree_1_Retained_Nodes
Total Node Count: 7674
	Chemical Nodes: 0
	Gene Nodes: 5390
	Disease Nodes: 2284
Edges: 22362
	Chemical-Chemical Edges: 0
	Chemical-Gene Edges: 0
	Gene-Gene Edges: 0
	Chemical-Disease Edges: 0
	Disease-Disease Edges: 0
	Gene-Disease Edges: 22362
Generating prospective object for CTD-DG
Took 3.99827957153e-05 min to return adjacency matrix
0 known entities are missing.
10 entities were mapped and labeled
Reloading pre-existing saved validation performance metrics
Predictions took 0.0724524140358 min to validate
In [6]:
# Produce a dataframe for plotting bootstrapped AUC median
data = pd.DataFrame(results['AUC_bootstrapMedian'], columns = inputListInitialNames, index=labels).T
# Plot a heatmap of bootstrapped AUC median
%matplotlib inline
plt.figure(figsize=(6,7))
sns.set(font_scale=2)
sns.heatmap(data=data, cmap = 'YlGnBu', vmin=0.6,vmax=0.9)
plt.xticks(rotation=90)
Out[6]:
(array([ 0.5,  1.5,  2.5]), <a list of 3 Text xticklabel objects>)
In [7]:
# Load one of the predictions for visualization
pred_files = glob(paths.VALIDATION_RESULT_DIR+'/prospective/AptRank1_CTD2016DG_Degree_1*/*_pred.tsv')
df = pd.read_csv(pred_files[0], sep="\t")
sortby = 'Breast Neoplasms'
df.index=df['Synonyms']
# Subset the predictions which are not known in the networks
df_score = df.loc[df['known_'+sortby]==0]
col_score = [x for x in df.columns if 'score_' in x]
df_score = df_score[col_score]
df_score.columns = [x.replace('score_','') for x in df_score.columns]
df_score = df_score.sort_values(sortby)
# df_score.tail().iloc[::-1]

An interactive heatmap for viewing predictions:

  • Use the cursor to select/crop part of the heatmap to zoom in
  • Hover the cursor over a cell to view its x, y, and prediction (z) values
  • Click three times or click 'Autoscale'/'Reset axes' (on the right panel) to zoom out
In [8]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
# Colour scale given as parallel lists: stop positions on the normalised [0, 1]
# scale and the colour assigned to each stop. Every intermediate stop lies
# between 0.911 and 1, so most of the palette is spent on that narrow band.
colorStops = [0,
              0.91111111111111111,
              0.92222222222222222,
              0.93333333333333333,
              0.94444444444444444,
              0.95555555555555556,
              0.96666666666666666,
              0.97777777777777778,
              0.98888888888888888,
              1]
stopColors = ['rgb(165,0,38)',
              'rgb(215,48,39)',
              'rgb(244,109,67)',
              'rgb(253,174,97)',
              'rgb(254,224,144)',
              'rgb(224,243,248)',
              'rgb(171,217,233)',
              'rgb(116,173,209)',
              'rgb(69,117,180)',
              'rgb(49,54,149)']
# Heatmap of df_score: its columns on the x axis, its index on the y axis.
trace = go.Heatmap(
    z=df_score.values,
    x=df_score.columns,
    y=df_score.index,
    reversescale=True,
    colorscale=[[stop, color] for stop, color in zip(colorStops, stopColors)])
data=[trace]
iplot(data, filename='labelled-heatmap')