%load_ext autoreload
from importlib import reload
import matplotlib.pyplot as plt
from matplotlib.cm import viridis
from matplotlib.cm import Greys
%matplotlib inline
import numpy as np
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise
import sklearn as sk
import community
import seaborn as sns
import networkx as nx
import pandas as pd
import subprocess
import sys
import pickle
# Working directory that holds the pickled graph and the embedding files.
dirTemp = "temp/" #directory for manipulating graphs and embedding
# Embedding to analyse: uncomment exactly one of the alternatives below.
#embeddingFile = dirTemp+"node2vec.emb"
#embeddingFile = dirTemp+"struc2vec.emb"
#embeddingFile = dirTemp+"LE.emb"
embeddingFile = dirTemp+"custom.emb"
# NOTE(review): unpickling executes arbitrary code; only load graph files you produced yourself.
# The context manager closes the file handle, which the bare open() call used to leak.
with open(dirTemp+"pickledGraph.p", "rb") as graphFile:
    G = pickle.load(graphFile)
#We use a specific loading function to handle files with nodes in non consecutive order
#theEmbedding = np.loadtxt(open(embeddingFile, "rb"), delimiter=" ", skiprows=1)
def loadEmbedding(file_name):
    """Load an embedding file into a dense matrix indexed by node id.

    The first line of the file holds the number of rows and the number of
    dimensions. Every following line holds a node id followed by the
    coordinates of that node, separated by whitespace. Lines may appear in
    any order: each vector is stored in the row given by its node id.

    Parameters
    ----------
    file_name : str
        Path of the embedding file to read.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, d), n and d being read from the header line. Rows
        whose id never appears in the file stay filled with zeros.
    """
    with open(file_name, 'r') as f:
        n, d = f.readline().strip().split()
        X = np.zeros((int(n), int(d)))
        for line in f:
            emb = line.split()
            if not emb:
                # Blank lines carry no vector; indexing emb[0] on them used
                # to raise IndexError.
                continue
            # Ids may be written as floats (e.g. "3.0"), hence int(float(...)).
            X[int(float(emb[0])), :] = [float(emb_i) for emb_i in emb[1:]]
    return X
# Embedding matrix: one row per node id, one column per embedding dimension.
theEmbedding = loadEmbedding(embeddingFile)

# Node-level properties of the graph, compared below with the embedding dimensions.
btw = nx.betweenness_centrality_source(G)
# dict() accepts both the plain dict returned by networkx 1.x and the
# DegreeView returned by networkx >= 2.0, which has no .items() method.
deg = dict(nx.degree(G))
close = nx.closeness_centrality(G)
coms = community.best_partition(graph=G)
namesDict = nx.get_node_attributes(G,"name")

# One row per node: the embedding dimensions (integer column labels) followed
# by the node properties. Each property is ordered by sorted node id.
# NOTE(review): this alignment assumes sorted node ids match the embedding
# row order (i.e. ids are 0..n-1) and that every node has a "name" - confirm.
features = pd.DataFrame(theEmbedding)
features["name"]=[v for k,v in sorted(namesDict.items())]
features["btw"]= [v for k,v in sorted(btw.items())]
features["deg"]=[v for k,v in sorted(deg.items())]
features["close"]=[v for k,v in sorted(close.items())]
features["coms"]=[v for k,v in sorted(coms.items())]
# Use the fully qualified option name: the short pattern 'precision' matches
# several options on recent pandas versions and is rejected as ambiguous.
pd.set_option('display.precision',2)
features.head(20)
We can explore the distribution of values of each feature
# Column to inspect: an integer selects an embedding dimension, a string
# selects one of the node-property columns (see the commented alternative).
#feature = "deg"
feature = 4
plt.rcParams['figure.figsize'] = [7, 7]
selectedValues = features[feature]
# Histogram / density estimate of the selected column, then its summary statistics.
sns.distplot(selectedValues)
print("mean: ",np.average(selectedValues),"std", np.std(selectedValues))
We want to have a general idea of the correlation between the features and other node properties, and between the features themselves
# Pairwise correlations between the embedding dimensions and the node properties.
# Numeric columns are selected explicitly: the "name" column is not numeric
# (presumably strings), and DataFrame.corr() raises on such columns since
# pandas 2.0, whereas older versions dropped them silently.
corrmat = features.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(12, 9))
# Trailing semicolon keeps the notebook from echoing the returned Axes object.
sns.heatmap(corrmat, vmax=.8, square=True);
We can explore visually how two features are correlated.
You can change the chosen features on the first line
# The two columns to compare: integers select embedding dimensions,
# strings select node-property columns.
feature1 = 4
feature2 = "deg"
# One point per node, positioned by its values on the two chosen columns.
features.plot(kind="scatter", x=feature1, y=feature2)
plt.show()
We evaluate whether the embedding is correlated with the community structure.
We plot, for each feature and each community, a box plot of the values taken by the nodes of that community.
If a feature is orthogonal to the community structure, these distributions should be similar across communities.
# One box plot per embedding dimension, grouping the nodes by community.
fig = plt.figure()
# The 2x4 grid shows at most 8 dimensions. Bounding the loop by the embedding
# width avoids the KeyError raised on embeddings with fewer than 8 dimensions.
nbDims = min(8, theEmbedding.shape[1])
for i in range(nbDims):
    plt.subplot(2, 4, i+1)
    x="coms"
    y=i
    # Two-column frame: community id and the values of dimension i.
    data = pd.concat([features[x], features[y]], axis=1)
    #f = sns.violinplot(x=x, y=y, data=data,)
    f = sns.boxplot(x=x, y=y, data=data,)
fig.set_size_inches(15,7)
#plt.ylim((0.5,-0.5))
plt.show()
It can be interesting to check which nodes are "outliers", i.e. have a value far from the mean of a feature (more than 1.5 standard deviations), either above or below it.
# Column of `features` whose outliers are reported.
featureToStudy = 4
def getOutliers(listValue,listNames,nbStd=1.5):
    """Print the names and values lying far above or below the mean.

    A value is reported under "TOP" when it is greater than
    mean + nbStd * std and under "BOTTOM" when it is smaller than
    mean - nbStd * std, where std is computed with numpy.std.

    Parameters
    ----------
    listValue : array-like of numbers
        The values to inspect.
    listNames : iterable
        The name attached to each value, in the same order as listValue.
    nbStd : float, optional
        Number of standard deviations away from the mean beyond which a
        value is reported. Defaults to 1.5.

    Returns
    -------
    None
        The result is only printed.
    """
    mean = np.average(listValue)
    std = np.std(listValue)
    # Pair names and values by position instead of indexing with range(len()):
    # this works for any iterable of names, not only integer-indexable ones.
    top = [(name, value) for name, value in zip(listNames, listValue)
           if value > mean + nbStd*std]
    bottom = [(name, value) for name, value in zip(listNames, listValue)
              if value < mean - nbStd*std]
    print("----TOP---",mean,std)
    print(top)
    print("----BOTTOM---")
    print(bottom)
# Report the nodes whose value on the studied column lies far from the mean.
getOutliers(listValue=features[featureToStudy], listNames=features["name"])
We study the correlation between the distance in the graph (shortest path length) and the distance in the embedding.
The way to measure the distance in the embedding depends on the embedding itself.
If not sure, we can change this distance on the first line
# Pairwise distances between the node vectors. Switch the metric by
# uncommenting one of the alternatives instead of the euclidean line.
#embDist = pairwise.cosine_distances(theEmbedding)
#embDist = pairwise.manhattan_distances(theEmbedding)
embDist = pairwise.euclidean_distances(theEmbedding)
plt.rcParams['figure.figsize'] = [10, 10]
dictNames = nx.get_node_attributes(G,"name")
# dict() makes this work with networkx 1.x (which returns a dict of dicts) and
# with networkx >= 2.0 (which returns an iterator of (source, lengths) pairs).
graphDist = dict(nx.shortest_path_length(G))
# Re-key both distance tables by unordered node pair so they can be matched.
# NOTE(review): this assumes node ids are the row indices of the embedding - confirm.
graphDist = {frozenset([i,j]):graphDist[i][j] for i in graphDist for j in graphDist[i] if i!=j}
embDist = {frozenset([i,j]):embDist[i][j] for i in range(len(embDist)) for j in range(len(embDist)) if i!=j}
# x: shortest-path length of each pair, y: embedding distance of the same pair.
x=[]
y=[]
for key in graphDist:
    #print([dictNames[k] for k in key],graphDist[key],embDist[key])
    x.append(graphDist[key])
    y.append(embDist[key])
# Keyword arguments are required: seaborn >= 0.12 no longer accepts x and y positionally.
sns.boxplot(x=x,y=y)
#plt.scatter(x, y)
# Last expression of the cell: the notebook displays the correlation matrix.
np.corrcoef(x,y)
We can use a technique called t-SNE (t-distributed Stochastic Neighbor Embedding) to project the n-dimensional embedding into a 2D embedding that tries to preserve the original distances between nodes.
We plot the graph with the nodes positioned according to this second embedding.
It helps us understand intuitively what has been captured by the embedding.
The colors of the nodes correspond to the communities found by Louvain.
# Categorical column that drives the node colours.
feature = "coms"
nbNodes = G.number_of_nodes()
cats = list(set(features[feature]))
num_categories = len(cats)
# Each category gets an evenly spaced position on the viridis colormap.
categoryOfNode = features[feature]
colors = [viridis(cats.index(categoryOfNode[i])/num_categories) for i in range(nbNodes)]

# Project the embedding to two dimensions and use the result as node positions.
model = TSNE(n_components=2)
node_pos = model.fit_transform(theEmbedding)
pos = {i: node_pos[i, :] for i in range(nbNodes)}

plt.rcParams['figure.figsize'] = [17, 10]
nodeLabels = nx.get_node_attributes(G,"name")
nx.draw_networkx(
    G,
    pos,
    node_color=colors,
    nodelist=range(nbNodes),
    width=0.1,
    node_size=500,
    arrows=False,
    alpha=0.8,
    font_size=10,
    labels=nodeLabels,
)
#cbar.ax.set_yticklabels(['< -1', '0', '> 1']) # vertically oriented colorbar
plt.show()
We change the meaning of the colors: they now represent the value of a single feature.
The positions of the nodes can come from the same t-SNE projection as before or from a traditional force-based layout (see the first lines).
# Column whose value drives the node colour: an integer selects an embedding
# dimension, a string selects a node-property column.
#feature = "deg"
feature = 1
# True: let networkx compute its default layout; False: use the t-SNE projection.
positionForceLayout=True
nbNodes = G.number_of_nodes()
cats = list(set(features[feature]))
# Rescale the column to [0, 1] so that it spans the whole Greys colormap.
normalized = sk.preprocessing.minmax_scale(features[feature])
colors = [Greys(normalized[i]) for i in range(nbNodes)]
# 2D projection of the embedding, used as node positions when
# positionForceLayout is False.
model = TSNE(n_components=2)
node_pos = model.fit_transform(theEmbedding)
pos = {i: node_pos[i, :] for i in range(nbNodes)}
plt.rcParams['figure.figsize'] = [17, 10]
# Resize the figure that is about to be drawn. `fig` used to be a leftover
# handle from an earlier cell, so the resize never reached this plot (and
# raised NameError when that cell had not been run).
fig = plt.gcf()
fig.set_size_inches(15,7)
# Passing pos=None makes draw_networkx compute its own layout, exactly as
# when the argument is omitted, so a single call covers both cases.
layout = None if positionForceLayout else pos
nx.draw_networkx(G, layout,
                 node_color=colors,nodelist=range(nbNodes),
                 width=0.1, node_size=500,
                 arrows=False, alpha=0.8,
                 font_size=10,labels=nx.get_node_attributes(G,"name"))
# Stand-alone mappable so the colorbar shows the Greys scale over [0, 1].
sm = plt.cm.ScalarMappable(cmap=Greys, norm=plt.Normalize(vmin=0, vmax=1))
# Public equivalent of the former `sm._A = []` assignment.
sm.set_array([])
# The mappable is not attached to any Axes, so name the Axes explicitly;
# matplotlib >= 3.6 refuses to guess where the colorbar should go.
plt.colorbar(sm, ax=plt.gca())
plt.show()