In [145]:
%load_ext autoreload
from importlib import reload
    
import matplotlib.pyplot as plt
from matplotlib.cm import viridis
from matplotlib.cm import Greys


%matplotlib inline  
import numpy as np

from sklearn.manifold import TSNE
from sklearn.metrics import pairwise
import sklearn as sk
import community
import seaborn as sns

import networkx as nx
import pandas as pd
import subprocess
import sys
import pickle
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
In [181]:
dirTemp = "temp/" #directory for manipulating graphs and embedding
# Embedding to analyse: keep exactly one of the following lines active.
#embeddingFile = dirTemp+"node2vec.emb"
#embeddingFile = dirTemp+"struc2vec.emb"
#embeddingFile = dirTemp+"LE.emb"
embeddingFile = dirTemp+"custom.emb"

# Load the pickled graph inside a `with` block so the file handle is closed as soon as
# the graph is read instead of being left open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code; only load files you produced yourself.
with open(dirTemp+"pickledGraph.p", "rb") as graphFile:
    G = pickle.load(graphFile)

#We use a specific loading function to handle files with nodes in non consecutive order
#theEmbedding = np.loadtxt(open(embeddingFile, "rb"), delimiter=" ", skiprows=1)
def loadEmbedding(file_name):
    """Load a node embedding from a whitespace-separated text file.

    The first line holds two integers ``n d``: the number of rows and the
    number of columns of the returned array. Every following non-blank line
    holds a node id followed by its coordinates. Lines may come in any order:
    each vector is written to the row whose index is the node id.

    Parameters
    ----------
    file_name : str
        Path of the embedding file.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, d)``. Rows whose id never appears in the file
        keep their initial value of zero.

    Raises
    ------
    IndexError
        If a node id is not a valid row index of the ``n``-row array.
    ValueError
        If the header or a field cannot be parsed as a number.
    """
    with open(file_name, 'r') as f:
        n, d = f.readline().strip().split()
        X = np.zeros((int(n), int(d)))
        for line in f:
            emb = line.split()
            if not emb:
                # Blank line (typically a trailing newline at the end of the file):
                # there is no node id to read, so skip it instead of crashing.
                continue
            # Ids may be written as floats ("3.0"), hence int(float(...)).
            X[int(float(emb[0])), :] = [float(emb_i) for emb_i in emb[1:]]
    return X
# Row i of theEmbedding holds the vector of the node whose id is i (see loadEmbedding).
theEmbedding = loadEmbedding(embeddingFile)

Computing "traditional" features on our graph

In [182]:
# Classical node-level descriptors that the embedding dimensions are compared against below.
# Each result is later read with .items(), i.e. it is used as a node -> value mapping.
btw = nx.betweenness_centrality_source(G)  # betweenness centrality
deg = nx.degree(G)  # node degree
close = nx.closeness_centrality(G)  # closeness centrality
# Community label of each node (described as Louvain communities further down in the notebook).
coms = community.best_partition(graph=G)

# Node id -> value of the "name" node attribute.
namesDict = nx.get_node_attributes(G,"name")
In [183]:
# One row per node: the embedding coordinates (integer-labelled columns) followed by the
# node name and the classical descriptors.
# sorted(....items()) orders every mapping by node id, which lines the values up with the
# embedding rows as long as the node ids are exactly 0..n-1.
features = pd.DataFrame(theEmbedding)
features["name"]=[v for k,v in sorted(namesDict.items())]
features["btw"]= [v for k,v in sorted(btw.items())]
# dict() makes this work whether nx.degree returned a plain dict or a degree view.
features["deg"]=[v for k,v in sorted(dict(deg).items())]
features["close"]=[v for k,v in sorted(close.items())]
features["coms"]=[v for k,v in sorted(coms.items())]


# Use the fully qualified option name: the bare 'precision' pattern is ambiguous in recent
# pandas (it also matches the Styler precision option) and raises an OptionError.
pd.set_option('display.precision',2)
features.head(20)
Out[183]:
0 1 2 3 4 5 6 7 name btw deg close coms
0 -0.51 1.29 0.61 -0.08 -0.53 -0.13 1.24 8.33e-01 Jaime-Lannister 8.57e-03 14 0.48 0
1 0.16 1.76 0.09 0.40 -0.39 0.57 1.87 8.13e-01 Tywin-Lannister 4.91e-03 9 0.47 0
2 -0.03 0.18 1.79 -1.20 0.69 -0.98 0.38 -4.14e-01 Daenerys-Targaryen 2.13e-01 16 0.46 3
3 -0.98 -0.60 0.46 -0.86 0.97 -0.95 0.08 2.24e-01 Eddard-Stark 2.45e-01 36 0.65 1
4 -0.95 0.62 1.91 2.74 1.68 -0.23 -0.30 -1.53e+00 Aemon-Targaryen-(Maester-Aemon) 0.00e+00 3 0.35 4
5 -0.95 0.85 2.15 1.29 1.17 -0.90 0.13 -1.16e+00 Alliser-Thorne 9.46e-04 4 0.38 4
6 -0.10 1.87 1.63 2.40 -0.56 0.40 0.46 2.28e-01 Jeor-Mormont 2.72e-03 6 0.38 4
7 0.85 -0.66 -0.56 -0.92 0.88 0.42 2.22 -1.48e+00 Jon-Snow 1.62e-01 21 0.53 4
8 0.64 0.97 1.85 2.53 -0.15 0.52 -1.03 7.50e-03 Samwell-Tarly 1.53e-03 7 0.36 4
9 -1.41 0.64 -0.33 0.32 1.95 1.04 1.03 -2.48e-01 Aerys-II-Targaryen 0.00e+00 3 0.41 1
10 -1.99 0.22 0.98 0.50 0.38 -0.76 -0.88 3.44e+00 Brandon-Stark 0.00e+00 3 0.40 1
11 -0.68 0.42 -0.69 1.14 1.87 -0.17 -0.41 1.09e+00 Jon-Arryn 3.03e-03 11 0.47 1
12 0.56 1.16 -0.35 1.07 -0.43 1.79 -1.19 8.35e-01 Robert-Baratheon 2.02e-01 30 0.62 1
13 0.06 1.63 0.64 0.29 2.56 0.35 -1.65 -1.11e-01 Aggo 0.00e+00 3 0.32 3
14 -0.90 0.67 -1.62 -1.54 -0.11 0.53 -1.12 7.08e-01 Drogo 7.01e-03 13 0.33 3
15 0.46 1.94 0.67 0.73 1.01 0.47 -1.57 1.21e+00 Jhogo 7.91e-05 4 0.32 3
16 1.00 0.62 -1.42 -0.37 1.44 1.42 -0.41 1.81e-01 Jorah-Mormont 3.41e-03 9 0.33 3
17 1.37 1.33 2.19 1.57 0.80 0.36 -0.48 -3.43e-01 Halder 0.00e+00 4 0.35 4
18 0.64 -0.54 1.50 2.12 1.96 0.30 -1.53 -1.51e+00 Grenn 2.11e-04 5 0.36 4
19 0.81 1.07 2.77 1.01 -1.09 0.36 -0.14 -3.23e-01 Pypar 2.11e-04 5 0.36 4

Distribution of features

We can explore the distribution of values of each feature

In [184]:
# Column to inspect: an embedding dimension (integer) or a named node property (string).
#feature = "deg"
feature = 4
plt.rcParams['figure.figsize'] = [7, 7]
featureValues = features[feature]
sns.distplot(featureValues)
print("mean: ", np.average(featureValues), "std", np.std(featureValues))
mean:  0.4867060049051196 std 1.089879105379009

Matrix of correlation between features

We want to have a general idea of the correlation between the features and other node properties, and between the features themselves

In [185]:
# Correlation between every pair of numeric columns (embedding dimensions and node
# properties). The text column "name" is excluded explicitly: recent pandas no longer
# drops non-numeric columns silently in DataFrame.corr() and raises instead.
corrmat = features.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

Intuition of correlation

We can explore visually how two features are correlated.

You can change the chosen features on the first line

In [186]:
# Columns to compare: embedding dimensions are integers, node properties are strings.
feature1, feature2 = 4, "deg"
features.plot.scatter(x=feature1, y=feature2)
plt.show()

Embedding and community structure

We evaluate whether the embedding is correlated with the community structure.

For each feature (embedding dimension) and each community, we plot the distribution (as a box plot) of the values of this feature over the nodes of the community.

If a feature is independent of the community structure, its values should be distributed similarly in every community.

In [187]:
# One box plot per embedding dimension, with the nodes grouped by community.
# The number of panels follows the embedding instead of assuming 8 dimensions.
nbDims = theEmbedding.shape[1]
nbCols = 4
nbRows = -(-nbDims // nbCols)  # ceiling division: enough rows for every dimension

fig = plt.figure()
# Sized once, outside the loop; 3.5 inches per row gives the original 15x7 for 8 dimensions.
fig.set_size_inches(15, 3.5*nbRows)
for i in range(nbDims):
    plt.subplot(nbRows, nbCols, i+1)
    x="coms"
    y=i
    data = pd.concat([features[x], features[y]], axis=1)
    #sns.violinplot(x=x, y=y, data=data,)
    sns.boxplot(x=x, y=y, data=data,)

    #plt.ylim((0.5,-0.5))
plt.show()

Analyzing outliers

It can be interesting to check which nodes are "outliers", i.e. have a value far from those of all other nodes, either much higher or much lower.

In [188]:
# Column of `features` whose extreme values are listed: an integer selects an embedding dimension.
featureToStudy = 4
def getOutliers(listValue,listNames,nbStd=1.5):
    """Print the entries whose value lies far above or far below the mean.

    An entry is reported in the "TOP" list when its value is greater than
    ``mean + nbStd * std`` and in the "BOTTOM" list when it is smaller than
    ``mean - nbStd * std``, where ``mean`` and ``std`` come from
    ``np.average`` and ``np.std`` of ``listValue``.

    Parameters
    ----------
    listValue : sequence of numbers
        Values to scan (list, array or pandas Series).
    listNames : sequence
        Name of each entry, in the same order as ``listValue``.
    nbStd : float, optional
        Width of the accepted band, in standard deviations. Defaults to 1.5.

    Returns
    -------
    None
        The result is printed as lists of ``(name, value)`` tuples.
    """
    mean = np.average(listValue)
    std = np.std(listValue)
    upper = mean+nbStd*std
    lower = mean-nbStd*std
    # Pair names and values by position. Indexing a pandas Series with listValue[i] is
    # label-based and fails (or picks the wrong row) when its index is not 0..n-1.
    pairs = list(zip(listNames, listValue))
    print("----TOP---",mean,std)
    print([(name,value) for name,value in pairs if value>upper])
    print("----BOTTOM---")
    print([(name,value) for name,value in pairs if value<lower])
# List the nodes whose value for the selected column is unusually high or low, by node name.
getOutliers(features[featureToStudy],features["name"])
----TOP--- 0.4867060049051196 1.089879105379009
[('Aggo', 2.5570593567177538), ('Alyn', 2.626114932289836), ('Hoster-Tully', 2.1725827978261543), ('Qotho', 2.2352921638830034), ('Illyrio-Mopatis', 2.892766796535084), ('Rhaegar-Targaryen', 2.3584554366094292)]
----BOTTOM---
[('Gregor-Clegane', -1.501969947766534), ('Loras-Tyrell', -2.0649246180021166), ('Shagga', -1.4333272497175193), ('Jon-Umber-(Greatjon)', -2.1222551172369775), ('Conn', -1.5449700691683836)]

Distance in the embedding

We study the correlation between the distance in the graph (shortest path length) and the distance in the embedding.

The way to measure the distance in the embedding depends on the embedding itself.

If unsure, try another distance by switching the active line at the top of the cell below.

In [189]:
# Distance between every pair of embedding vectors; switch the active line to try another metric.
#embDistMatrix = pairwise.cosine_distances(theEmbedding)
#embDistMatrix = pairwise.manhattan_distances(theEmbedding)
embDistMatrix = pairwise.euclidean_distances(theEmbedding)

plt.rcParams['figure.figsize'] = [10, 10]
dictNames = nx.get_node_attributes(G,"name")
# dict() accepts both forms returned by shortest_path_length across networkx versions:
# a dict of dicts, or an iterator of (node, {target: length}) pairs.
pathLengths = dict(nx.shortest_path_length(G))
# Key every pair of distinct nodes by a frozenset so that (i, j) and (j, i) coincide.
graphDist = {frozenset([i,j]):pathLengths[i][j] for i in pathLengths for j in pathLengths[i] if i!=j}
# Row/column k of the distance matrix is taken to be node k, as in the embedding array.
embDist = {frozenset([i,j]):embDistMatrix[i][j] for i in range(len(embDistMatrix)) for j in range(len(embDistMatrix)) if i!=j}
x=[]
y=[]
for key in graphDist:
    #print([dictNames[k] for k in key],graphDist[key],embDist[key])
    x.append(graphDist[key])
    y.append(embDist[key])
# Keyword arguments: recent seaborn releases no longer accept x and y positionally.
sns.boxplot(x=x,y=y)
#plt.scatter(x, y)
# Correlation between graph distance and embedding distance (displayed as the cell output).
np.corrcoef(x,y)
Out[189]:
array([[1.        , 0.43143685],
       [0.43143685, 1.        ]])

Preservation of the network structure

We can use a technique called t-SNE (t-distributed Stochastic Neighbor Embedding) to convert the n-dimensional embedding into a 2D embedding that tries to preserve the original distances between nodes.

We plot the graph with the nodes positioned according to this second embedding.

It helps us to understand intuitively what has been captured by the embedding.

The color of each node corresponds to its community, as found by the Louvain algorithm.

In [190]:
feature = "coms"
nbNodes = G.number_of_nodes()
# sorted() gives a stable category -> colour assignment from one run to the next.
cats = sorted(set(features[feature]))
num_categories = len(cats)
colors = [viridis(cats.index(features[feature][i])/num_categories) for i in range(nbNodes)]
# t-SNE is stochastic: fix the seed so that the 2D layout is reproducible across runs.
model = TSNE(n_components=2, random_state=0)
node_pos = model.fit_transform(theEmbedding)

# Node id -> 2D position; node ids are taken to be 0..nbNodes-1, matching the embedding rows.
pos = {i: node_pos[i, :] for i in range(nbNodes)}
In [191]:
# Draw the graph with every node at its `pos` coordinates, coloured with the `colors` list.
plt.rcParams['figure.figsize'] = [17, 10]

nodeLabels = nx.get_node_attributes(G, "name")
drawOptions = dict(
    node_color=colors, nodelist=range(nbNodes),
    width=0.1, node_size=500,
    arrows=False, alpha=0.8,
    font_size=10, labels=nodeLabels,
)
nx.draw_networkx(G, pos, **drawOptions)

#cbar.ax.set_yticklabels(['< -1', '0', '> 1'])  # vertically oriented colorbar
plt.show()
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:522: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(edge_color) \
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:543: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if cb.is_string_like(edge_color) or len(edge_color) == 1:
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:724: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(label):

Visual intuition of features

We change the meaning of colors: they now represent the value of a single feature.

The position of nodes can come from the same t-SNE solution as before or from a traditional force-based layout (chosen on the first lines of the cell).

In [192]:
# Column used to colour the nodes: an embedding dimension (integer) or a node property (string).
#feature = "deg"
feature = 1
# True: let networkx compute a force-directed layout; False: reuse the t-SNE positions.
positionForceLayout=True


nbNodes = G.number_of_nodes()
cats = list(set(features[feature]))
# Rescale the feature to [0, 1] so that it can be fed directly to the colour map.
normalized = sk.preprocessing.minmax_scale(features[feature])
colors = [Greys(normalized[i]) for i in range(nbNodes)]
# t-SNE is stochastic: fix the seed so that the 2D layout is reproducible across runs.
model = TSNE(n_components=2, random_state=0)
node_pos = model.fit_transform(theEmbedding)

# Node id -> 2D position; node ids are taken to be 0..nbNodes-1, matching the embedding rows.
pos = {i: node_pos[i, :] for i in range(nbNodes)}
In [193]:
# The figure size comes from rcParams. The former fig.set_size_inches(...) call resized the
# figure object left over from the community box-plot cell, not the one drawn here.
plt.rcParams['figure.figsize'] = [17, 10]

# pos=None makes draw_networkx compute its own default layout (the force-based option);
# otherwise the t-SNE positions are used. A single call replaces the two duplicated branches.
layout = None if positionForceLayout else pos
nx.draw_networkx(G, pos=layout,
                   node_color=colors,nodelist=range(nbNodes),
                   width=0.1, node_size=500,
                   arrows=False, alpha=0.8,
                   font_size=10,labels=nx.get_node_attributes(G,"name"))
# Stand-alone mappable for the colour bar: node colours were computed by hand from values
# rescaled to [0, 1], so the bar uses the same colour map and range.
sm = plt.cm.ScalarMappable(cmap=Greys, norm=plt.Normalize(vmin=0, vmax=1))
sm.set_array([])  # public API instead of writing the private _A attribute
# The mappable is not attached to any axes, so say explicitly where the bar takes its space.
plt.colorbar(sm, ax=plt.gca())
plt.show()
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:522: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(edge_color) \
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:543: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if cb.is_string_like(edge_color) or len(edge_color) == 1:
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:724: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(label):