import sys
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
PROJECT_DIR = os.getenv("PROJECT_DIR")
sys.path.append(PROJECT_DIR+'/src')
import paths
from GraphRepresentation5 import GraphRepresentation5
from AptRank import AptRank
from GraphBasedDiffusion import GraphBasedDiffusion
from RandomWalkDiffusion import RandomWalkDiffusion
from prospective import prospective
from IPython import embed
from time import time
from utility import setNetworks, generateDegree1net
import pandas as pd
from collections import defaultdict as ddict
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import numpy as np
# Data file locations
FDAdrugsFile = paths.PROJECT_DIR + '/data/other/full database_12132016_FDAdrugsFormatedmappedFiltered.txt'
MeSH_name_file1 = paths.PROJECT_DIR + '/data/other/c2016.bin_dict'
MeSH_name_file2 = paths.PROJECT_DIR + '/data/other/d2016.bin_dict'
experiment = 'DG'
# Load network settings
networkNames, conns, selectedQueryModes, selectedPredictionModes = setNetworks(2016, 'Prospective', experiment, paths.NETWORK_MAPPING_FILE, paths.NETWORK_DATA_DIR)
print('Going to load networks: {} for {} experiment'.format(networkNames, experiment))
print('Query modes: {}'.format(selectedQueryModes))
print('Prediction modes: {}'.format(selectedPredictionModes))
# Set algorithms for testing
algorithms = [GraphBasedDiffusion, RandomWalkDiffusion, AptRank]
algorithmNames = ['GID', 'RW', 'AptRank']
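# The classes above wrap the actual implementations. Purely as an illustration of the
# flavor of these methods, the sketch below shows a generic random walk with restart on a
# dense adjacency matrix A (column-normalized, restart probability r). It assumes there
# are no isolated nodes and is never called by this notebook.
def _randomWalkWithRestartSketch(A, seedVec, r=0.3, tol=1e-6, maxIter=100):
    """Illustrative only: iterate p = (1 - r) * W @ p + r * seed until convergence."""
    W = A / A.sum(axis=0, keepdims=True)   # column-normalize so each column sums to 1
    seed = seedVec / seedVec.sum()         # normalize the restart (query) vector
    p = seed.copy()
    for _ in range(maxIter):
        pNext = (1 - r) * W @ p + r * seed
        if np.abs(pNext - p).sum() < tol:
            break
        p = pNext
    return p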
# Iteratively trim nodes with degree 1 from the last input network
# (the last input network is assumed to be the largest one)
_net, removedNodes = generateDegree1net(conns[-1], paths.NETWORK_MAPPING_FILE)
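# generateDegree1net above performs the actual trimming (its behavior and return values
# live in the utility module). The sketch below only illustrates the idea of iterative
# degree-1 trimming, assuming a networkx-like graph; it is never called by this notebook.
def _trimDegree1Sketch(graph):
    """Illustrative only: repeatedly remove leaf nodes (degree == 1) until none remain."""
    removedIds = []
    leaves = [n for n, d in graph.degree() if d == 1]
    while leaves:
        graph.remove_nodes_from(leaves)
        removedIds.extend(leaves)
        leaves = [n for n, d in graph.degree() if d == 1]
    return graph, removedIds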
# Load a query list of 10 diseases as input
inputListFile = PROJECT_DIR + '/data/prospective_input_list/top10DieasesFormatedmappedFiltered.txt'
# Read the query list once: column 0 holds the mapped identifiers, column 4 the original names
inputDF = pd.read_csv(inputListFile, sep="\t", header=None)
inputList = inputDF[0].values
inputListInitialNames = inputDF[4].values
inputPrefix = inputListFile.split('/')[-1].split('.')[0]
print('Initial names: {}'.format(inputListInitialNames), 'Mapped IDs: {}'.format(inputList))
labels = []
results = ddict(list)
# Only run the first network for testing;
# the largest network requires 256 GB of memory
networkNames = [networkNames[0]]
for algorithm, algorithmName in zip(algorithms, algorithmNames):
    for i in range(len(networkNames)):
        currQueryMode = selectedQueryModes[i]
        currPredictionMode = selectedPredictionModes[i]
        print('Running Prospective for {} using {}'.format(experiment, algorithmName))
        previous = time()
        currNet, _removedNodes = generateDegree1net(
            conns[i], paths.NETWORK_MAPPING_FILE, removedNodes)
        currName = networkNames[i]
        later = time()
        print('Graph generation took {} min'.format((later - previous) / 60))
        previous = time()
        ###################################################################
        print(currNet)
        print('Generating prospective object for {}'.format(currName))
        validation = prospective(name=inputPrefix, algorithm=algorithm, network=currNet,
                                 selectedEntityList=inputList, selectedEntityInitName=inputListInitialNames,
                                 selectedPredictionMode=currPredictionMode,
                                 selectedQueryMode=currQueryMode,
                                 mappingFile=paths.NETWORK_MAPPING_FILE, FDAdrugsFile=FDAdrugsFile,
                                 MeSH_name_file1=MeSH_name_file1, MeSH_name_file2=MeSH_name_file2,
                                 splitPerformanceByEntity=True)
        validation.perform()
        currentResult = validation.getResults()
        labels.append("%s %s" % (algorithmName, currName))
        for x in validation.savedVars:
            results[x].append(currentResult[x])
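# Optionally checkpoint the collected results so the plotting steps below can be rerun
# without repeating the diffusion runs (the output path here is only an example):
# pd.DataFrame(results).to_pickle(paths.PROJECT_DIR + '/results/prospective_results.pkl')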
# Produce a dataframe for plotting bootstrapped AUC median
data = pd.DataFrame(results['AUC_bootstrapMedian'], columns=inputListInitialNames, index=labels).T
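# 'AUC_bootstrapMedian' is produced inside prospective.getResults(); the helper below is
# only a sketch of what a bootstrapped AUC median usually means (resample the labelled
# scores with replacement, compute an AUC per resample, take the median). It assumes
# scikit-learn is available and is never called by this notebook.
def _bootstrapAucMedianSketch(yTrue, yScore, nBoot=1000, seed=0):
    from sklearn.metrics import roc_auc_score
    rng = np.random.RandomState(seed)
    yTrue, yScore = np.asarray(yTrue), np.asarray(yScore)
    aucs = []
    for _ in range(nBoot):
        idx = rng.randint(0, len(yTrue), len(yTrue))
        if len(np.unique(yTrue[idx])) < 2:   # AUC is undefined without both classes
            continue
        aucs.append(roc_auc_score(yTrue[idx], yScore[idx]))
    return np.median(aucs)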
# Plot a heatmap of bootstrapped AUC median
%matplotlib inline
plt.figure(figsize=(6,7))
sns.set(font_scale=2)
sns.heatmap(data=data, cmap='YlGnBu', vmin=0.6, vmax=0.9)
plt.xticks(rotation=90)
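# Optionally write the heatmap to disk (the file name below is only an example):
# plt.savefig(paths.PROJECT_DIR + '/results/prospective_AUC_bootstrapMedian_heatmap.png',
#             dpi=300, bbox_inches='tight')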
# Load one of the predictions for visualization
pred_files = glob(paths.VALIDATION_RESULT_DIR+'/prospective/AptRank1_CTD2016DG_Degree_1*/*_pred.tsv')
df = pd.read_csv(pred_files[0], sep="\t")
sortby = 'Breast Neoplasms'
df.index = df['Synonyms']
# Keep only the predictions that are not already known in the networks
df_score = df.loc[df['known_' + sortby] == 0]
col_score = [x for x in df.columns if 'score_' in x]
df_score = df_score[col_score]
df_score.columns = [x.replace('score_', '') for x in df_score.columns]
df_score = df_score.sort_values(sortby)
# Preview the highest-scoring novel predictions, best last:
# df_score.tail().iloc[::-1]
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
trace = go.Heatmap(z=df_score.values,
x=df_score.columns,
y=df_score.index,
reversescale=True,
colorscale=[[0, 'rgb(165,0,38)'],
[0.91111111111111111, 'rgb(215,48,39)'],
[0.92222222222222222, 'rgb(244,109,67)'],
[0.93333333333333333, 'rgb(253,174,97)'],
[0.94444444444444444, 'rgb(254,224,144)'],
[0.95555555555555556, 'rgb(224,243,248)'],
[0.96666666666666666, 'rgb(171,217,233)'],
[0.97777777777777778, 'rgb(116,173,209)'],
[0.98888888888888888, 'rgb(69,117,180)'],
[1, 'rgb(49,54,149)']])
plotData = [trace]  # use a new name so the AUC DataFrame `data` defined above is not overwritten
iplot(plotData, filename='labelled-heatmap')
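# When running outside a notebook, plotly.offline.plot (imported above) can write the same
# figure to a standalone HTML file instead of rendering it inline (file name is an example):
# plot(plotData, filename='labelled-heatmap.html', auto_open=False)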