%load_ext autoreload
from importlib import reload
    
import matplotlib.pyplot as plt
from matplotlib.cm import viridis
from matplotlib.cm import Greys


%matplotlib inline  
import numpy as np

from sklearn.manifold import TSNE
from sklearn.metrics import pairwise
import sklearn as sk
import community
import seaborn as sns

import networkx as nx
import pandas as pd
import subprocess
import sys
import pickle

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

# Working directory that holds the pickled graph and the embedding files.
dirTemp = "temp/" #directory for manipulating graphs and embedding
# Uncomment exactly one of the lines below to choose the embedding to analyse.
#embeddingFile = dirTemp+"node2vec.emb"
#embeddingFile = dirTemp+"struc2vec.emb"
#embeddingFile = dirTemp+"LE.emb"
embeddingFile = dirTemp+"custom.emb"



# NOTE(review): pickle executes arbitrary code while loading; only open graph
# files that were produced by a trusted source.
# The context manager closes the file handle as soon as the graph is loaded.
with open(dirTemp+"pickledGraph.p", "rb") as graphFile:
    G = pickle.load(graphFile)

#We use a specific loading function to handle files with nodes in non consecutive order
#theEmbedding = np.loadtxt(open(embeddingFile, "rb"), delimiter=" ", skiprows=1)
def loadEmbedding(file_name):
    """Read a whitespace-separated embedding file into a dense array.

    The first line of the file holds two integers: the number of rows ``n``
    and the number of dimensions ``d``. Every following non-empty line holds
    a node id followed by its ``d`` coordinates. Lines may appear in any
    order: each vector is stored at the row given by its node id.

    Parameters
    ----------
    file_name : str
        Path of the embedding file.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, d)``; rows whose id never appears stay at zero.

    Raises
    ------
    ValueError
        If the header or a coordinate cannot be parsed, or if a line does
        not carry exactly ``d`` coordinates.
    IndexError
        If a node id falls outside ``[-n, n)``.
    """
    with open(file_name, 'r') as f:
        n, d = f.readline().split()
        X = np.zeros((int(n), int(d)))
        for line in f:
            emb = line.split()
            if not emb:
                # Blank lines (typically a trailing newline) carry no vector.
                continue
            # Ids are sometimes written as floats ("12.0"), hence int(float()).
            X[int(float(emb[0])), :] = [float(emb_i) for emb_i in emb[1:]]
    return X
# Load the selected embedding file as a 2-D array (row index = node id).
theEmbedding = loadEmbedding(embeddingFile)

Computing "traditional" features on our graph¶

# Classical node descriptors, each handled below as a {node: value} mapping.
btw = nx.betweenness_centrality_source(G)
# dict(...) keeps this working on NetworkX 1.x (already a dict) and on
# NetworkX >= 2.0, where nx.degree returns a DegreeView without .items().
deg = dict(nx.degree(G))
close = nx.closeness_centrality(G)
coms = community.best_partition(graph=G)

namesDict = nx.get_node_attributes(G,"name")

# One row per embedding vector; the embedding dimensions become columns 0..d-1.
features = pd.DataFrame(theEmbedding)
# NOTE(review): values are appended in sorted node order, which assumes the
# node ids are 0..n-1 so that they line up with the embedding rows.
features["name"] = [namesDict[node] for node in sorted(namesDict)]
features["btw"] = [btw[node] for node in sorted(btw)]
features["deg"] = [deg[node] for node in sorted(deg)]
features["close"] = [close[node] for node in sorted(close)]
features["coms"] = [coms[node] for node in sorted(coms)]


# The fully qualified option name is required since pandas 2.0 and is also
# accepted by older releases (the bare 'precision' shortcut was removed).
pd.set_option('display.precision', 2)
features.head(20)

Distribution of features¶

We can explore the distribution of values of each feature

# Column of `features` to inspect: an embedding dimension (integer) or a
# named column such as "deg".
#feature = "deg"
feature = 4
plt.rcParams['figure.figsize'] = [7, 7]
# Pull the column out once and reuse it for the plot and the summary line.
featureValues = features[feature]
sns.distplot(featureValues)
print("mean: ",np.average(featureValues),"std", np.std(featureValues))

mean:  0.4867060049051196 std 1.089879105379009

Matrix of correlation between features¶

We want to have a general idea of the correlation between the features and other node properties, and between the features themselves

# Correlate numeric columns only: the "name" column holds strings, which
# recent pandas versions refuse to correlate (older ones dropped it silently).
corrmat = features.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(12, 9))
# Draw on the axes created above; the trailing ';' hides the notebook echo.
sns.heatmap(corrmat, vmax=.8, square=True, ax=ax);

Intuition of correlation¶

We can explore visually how two features are correlated.

You can change the chosen features on the first line

# The two columns of `features` to plot against each other
# (embedding dimensions are integers, node properties are strings).
feature1 = 4
feature2 = "deg"
features.plot(kind="scatter", x=feature1, y=feature2)
plt.show()

Embedding and community structure¶

We evaluate whether the embedding is correlated with the community structure.

We plot, for each feature, for each community, the average value for this feature of nodes in the community

If a feature is orthogonal to the community structure, its average value should be similar across communities.

# One panel per embedding dimension (columns 0..7 of `features`), laid out on
# a 2 x 4 grid; each panel shows the distribution of that dimension within
# every community. The figure is sized once, up front, rather than on every
# loop iteration.
fig = plt.figure(figsize=(15, 7))
x="coms"
for i in range(8):
    plt.subplot(2, 4, i+1)
    y=i
    data = features[[x, y]]
    # Alternative view: sns.violinplot(x=x, y=y, data=data)
    sns.boxplot(x=x, y=y, data=data)

    #plt.ylim((0.5,-0.5))
plt.show()

Analyzing outliers¶

It could be interesting to check which nodes are "outliers", i.e. have values far from those of all other nodes, in one direction or the other.

# Column of `features` (here an embedding dimension) whose extreme values are listed below.
featureToStudy = 4
def getOutliers(listValue, listNames, nbStd=1.5):
    """Print the entries whose value lies far from the mean.

    Prints the mean and standard deviation of ``listValue``, then the
    ``(name, value)`` pairs lying more than ``nbStd`` standard deviations
    above the mean ("TOP") and below it ("BOTTOM").

    Parameters
    ----------
    listValue : sequence of numbers
        Values to screen (list, NumPy array or pandas Series).
    listNames : sequence
        Label of each value, in the same order as ``listValue``.
    nbStd : float, optional
        Number of standard deviations beyond which a value is reported.
        Defaults to 1.5.

    Returns
    -------
    None
        The result is only printed.
    """
    mean = np.average(listValue)
    std = np.std(listValue)
    upper = mean + nbStd*std
    lower = mean - nbStd*std
    # Pair names and values by position so that the result does not depend on
    # how the inputs are indexed (e.g. a Series with a non-default index).
    pairs = list(zip(listNames, listValue))
    print("----TOP---",mean,std)
    print([(name, value) for name, value in pairs if value > upper])
    print("----BOTTOM---")
    print([(name, value) for name, value in pairs if value < lower])
# Print, by name, the nodes whose value on the chosen column is unusually high or low.
getOutliers(features[featureToStudy],features["name"])

----TOP--- 0.4867060049051196 1.089879105379009
[('Aggo', 2.5570593567177538), ('Alyn', 2.626114932289836), ('Hoster-Tully', 2.1725827978261543), ('Qotho', 2.2352921638830034), ('Illyrio-Mopatis', 2.892766796535084), ('Rhaegar-Targaryen', 2.3584554366094292)]
----BOTTOM---
[('Gregor-Clegane', -1.501969947766534), ('Loras-Tyrell', -2.0649246180021166), ('Shagga', -1.4333272497175193), ('Jon-Umber-(Greatjon)', -2.1222551172369775), ('Conn', -1.5449700691683836)]

Distance in the embedding¶

We study the correlation between the distance in the graph (shortest path length) and the distance in the embedding.

The way to measure the distance in the embedding depends on the embedding itself.

If not sure, we can change this distance on the first line

# Pairwise distance matrix between embedding vectors; uncomment exactly one
# metric, depending on what the embedding method is supposed to preserve.
#embDist = pairwise.cosine_distances(theEmbedding)
#embDist = pairwise.manhattan_distances(theEmbedding)
embDist = pairwise.euclidean_distances(theEmbedding)

plt.rcParams['figure.figsize'] = [10, 10]
dictNames = nx.get_node_attributes(G,"name")
# dict(...) is needed on NetworkX >= 2.0, where shortest_path_length(G) yields
# (source, lengths) pairs instead of returning a dict of dicts; it leaves the
# NetworkX 1.x result unchanged.
graphDist = dict(nx.shortest_path_length(G))
# Key both distances by the unordered pair {i, j} so each pair is counted once.
graphDist = {frozenset([i,j]):graphDist[i][j] for i in graphDist for j in graphDist[i] if i!=j}
# Walking one triangle of the matrix visits every unordered pair exactly once
# (the full double loop wrote each key twice).
nbRows = len(embDist)
embDist = {frozenset([i,j]):embDist[i][j] for i in range(nbRows) for j in range(i)}
x=[]
y=[]
for key, hops in graphDist.items():
    #print([dictNames[k] for k in key],hops,embDist[key])
    x.append(hops)
    y.append(embDist[key])
# Keyword arguments: recent seaborn releases no longer accept x and y positionally.
sns.boxplot(x=x, y=y)
#plt.scatter(x, y)
np.corrcoef(x,y)

array([[1.        , 0.43143685],
       [0.43143685, 1.        ]])

Preservation of the network structure¶

We can use a technique called t-SNE (t-distributed Stochastic Neighbor Embedding) to convert the n-dimensional embedding into a 2D embedding that tries to preserve the original distances between nodes.

We plot the graph with the nodes positioned according to this second embedding.

It helps us to understand intuitively what has been captured by the embedding.

The colors of the nodes correspond to the communities found by the Louvain algorithm.

# Colour every node by the category it takes for `feature` (here its community).
feature = "coms"
nbNodes = G.number_of_nodes()
cats = list(set(features[feature]))
num_categories = len(cats)
# Position of each category inside `cats` (same value cats.index would return).
catRank = {cat: rank for rank, cat in enumerate(cats)}
colors = [viridis(catRank[features[feature][i]]/num_categories) for i in range(nbNodes)]

# Project the embedding to 2D and use the projection as node coordinates.
model = TSNE(n_components=2)
node_pos = model.fit_transform(theEmbedding)
pos = {i: node_pos[i, :] for i in range(nbNodes)}

plt.rcParams['figure.figsize'] = [17, 10]

nodeLabels = nx.get_node_attributes(G,"name")
nx.draw_networkx(
    G,
    pos,
    node_color=colors,
    nodelist=range(nbNodes),
    width=0.1,
    node_size=500,
    arrows=False,
    alpha=0.8,
    font_size=10,
    labels=nodeLabels,
)

plt.show()

/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:522: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(edge_color) \
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:543: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if cb.is_string_like(edge_color) or len(edge_color) == 1:
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:724: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(label):

Visual intuition of features¶

We change the meaning of colors: they now represent the value of a single feature.

The positions of the nodes can come from the same t-SNE solution as before or from a traditional force-based layout (set on the first lines).

# Imported explicitly: `import sklearn as sk` alone does not guarantee that the
# sk.preprocessing submodule has been loaded.
from sklearn.preprocessing import minmax_scale

# Column of `features` used to shade the nodes (integer = embedding dimension).
#feature = "deg"
feature = 1
# True: default force-directed layout; False: t-SNE projection of the embedding.
positionForceLayout=True


nbNodes = G.number_of_nodes()
cats = list(set(features[feature]))
# Rescale the feature to [0, 1] so it can be fed directly to the colormap.
normalized = minmax_scale(features[feature])
colors = [Greys(value) for value in normalized[:nbNodes]]

if positionForceLayout:
    # pos=None makes draw_networkx compute its default spring layout.
    layout = None
else:
    # t-SNE is only run when its coordinates are actually used.
    model = TSNE(n_components=2)
    node_pos = model.fit_transform(theEmbedding)
    pos = {i: node_pos[i, :] for i in range(nbNodes)}
    layout = pos

# The figure created implicitly by the drawing call picks up this size.
plt.rcParams['figure.figsize'] = [17, 10]

nx.draw_networkx(G, pos=layout,
                   node_color=colors,nodelist=range(nbNodes),
                   width=0.1, node_size=500,
                   arrows=False, alpha=0.8,
                   font_size=10,labels=nx.get_node_attributes(G,"name"))
# Stand-alone mappable so the colorbar reflects the normalized [0, 1] range.
sm = plt.cm.ScalarMappable(cmap=Greys, norm=plt.Normalize(vmin=0, vmax=1))
sm.set_array([])
# Recent Matplotlib needs the target axes when the mappable is not attached to one.
plt.colorbar(sm, ax=plt.gca())
plt.show()

/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:522: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(edge_color) \
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:543: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if cb.is_string_like(edge_color) or len(edge_color) == 1:
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:724: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(label):

	0	1	2	3	4	5	6	7	name	btw	deg	close	coms
0	-0.51	1.29	0.61	-0.08	-0.53	-0.13	1.24	8.33e-01	Jaime-Lannister	8.57e-03	14	0.48	0
1	0.16	1.76	0.09	0.40	-0.39	0.57	1.87	8.13e-01	Tywin-Lannister	4.91e-03	9	0.47	0
2	-0.03	0.18	1.79	-1.20	0.69	-0.98	0.38	-4.14e-01	Daenerys-Targaryen	2.13e-01	16	0.46	3
3	-0.98	-0.60	0.46	-0.86	0.97	-0.95	0.08	2.24e-01	Eddard-Stark	2.45e-01	36	0.65	1
4	-0.95	0.62	1.91	2.74	1.68	-0.23	-0.30	-1.53e+00	Aemon-Targaryen-(Maester-Aemon)	0.00e+00	3	0.35	4
5	-0.95	0.85	2.15	1.29	1.17	-0.90	0.13	-1.16e+00	Alliser-Thorne	9.46e-04	4	0.38	4
6	-0.10	1.87	1.63	2.40	-0.56	0.40	0.46	2.28e-01	Jeor-Mormont	2.72e-03	6	0.38	4
7	0.85	-0.66	-0.56	-0.92	0.88	0.42	2.22	-1.48e+00	Jon-Snow	1.62e-01	21	0.53	4
8	0.64	0.97	1.85	2.53	-0.15	0.52	-1.03	7.50e-03	Samwell-Tarly	1.53e-03	7	0.36	4
9	-1.41	0.64	-0.33	0.32	1.95	1.04	1.03	-2.48e-01	Aerys-II-Targaryen	0.00e+00	3	0.41	1
10	-1.99	0.22	0.98	0.50	0.38	-0.76	-0.88	3.44e+00	Brandon-Stark	0.00e+00	3	0.40	1
11	-0.68	0.42	-0.69	1.14	1.87	-0.17	-0.41	1.09e+00	Jon-Arryn	3.03e-03	11	0.47	1
12	0.56	1.16	-0.35	1.07	-0.43	1.79	-1.19	8.35e-01	Robert-Baratheon	2.02e-01	30	0.62	1
13	0.06	1.63	0.64	0.29	2.56	0.35	-1.65	-1.11e-01	Aggo	0.00e+00	3	0.32	3
14	-0.90	0.67	-1.62	-1.54	-0.11	0.53	-1.12	7.08e-01	Drogo	7.01e-03	13	0.33	3
15	0.46	1.94	0.67	0.73	1.01	0.47	-1.57	1.21e+00	Jhogo	7.91e-05	4	0.32	3
16	1.00	0.62	-1.42	-0.37	1.44	1.42	-0.41	1.81e-01	Jorah-Mormont	3.41e-03	9	0.33	3
17	1.37	1.33	2.19	1.57	0.80	0.36	-0.48	-3.43e-01	Halder	0.00e+00	4	0.35	4
18	0.64	-0.54	1.50	2.12	1.96	0.30	-1.53	-1.51e+00	Grenn	2.11e-04	5	0.36	4
19	0.81	1.07	2.77	1.01	-1.09	0.36	-0.14	-3.23e-01	Pypar	2.11e-04	5	0.36	4