In [22]:
%load_ext autoreload
from importlib import reload
    
import matplotlib.pyplot as plt
from matplotlib.cm import viridis
from matplotlib.cm import Greys


%matplotlib inline  
import numpy as np

from sklearn.manifold import TSNE
from sklearn.metrics import pairwise
import sklearn as sk
import community
import seaborn as sns

import networkx as nx
import pandas as pd
import subprocess
import sys
import pickle
from operator import itemgetter
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Loading a network and its corresponding embedding

In [24]:
dirTemp = "temp/" #directory for manipulating graphs and embedding
#embeddingFile = "temp/node2vec.emb"
#embeddingFile = "temp/struc2vec.emb"
embeddingFile = "temp/custom.emb"


G = pickle.load( open( dirTemp+"pickledGraph.p", "rb" ) )
def loadEmbedding(file_name):
    with open(file_name, 'r') as f:
        n, d = f.readline().strip().split()
        X = np.zeros((int(n), int(d)))
        for line in f:
            emb = line.strip().split()
            emb_fl = [float(emb_i) for emb_i in emb[1:]]
            X[int(float(emb[0])), :] = emb_fl
    return X
theEmbedding = loadEmbedding(embeddingFile)

Unsupervised link prediction

1)Compute the distance between nodes in the embedding

2)The shorter this distance, the most propable a link

In [25]:
embDist = pairwise.cosine_distances(theEmbedding)
#embDist = pairwise.euclidean_distances(theEmbedding)
embDist = {frozenset([i,j]):embDist[i][j] for i in range(len(embDist)) for j in range(len(embDist)) if i!=j}
In [26]:
edges = G.edges()
print("nb edges: ",len(edges))
edges = {frozenset(e) for e in edges}
print("pairs of nodes: ",len(embDist))
missingEdgesScore = {k:v for k,v in embDist.items() if not k in edges}
print("potential edges: ",len(missingEdgesScore))
nb edges:  340
pairs of nodes:  3240
potential edges:  2900
In [27]:
names= nx.get_node_attributes(G,"name")
sortedPredictions = sorted(((value, key) for (key,value) in missingEdgesScore.items()),reverse=False)
sortedPredictions = [(score,IDs,(names[list(IDs)[0]],names[list(IDs)[1]])) for score,IDs in sortedPredictions]
sortedPredictions[:10]
Out[27]:
[(0.009818455000647708,
  frozenset({62, 63}),
  ('Brynden-Tully', 'Edmure-Tully')),
 (0.04598489571509412,
  frozenset({1, 9}),
  ('Tywin-Lannister', 'Aerys-II-Targaryen')),
 (0.07337936250431443, frozenset({13, 15}), ('Aggo', 'Jhogo')),
 (0.07555526916130728, frozenset({21, 23}), ('Alyn', 'Tomard')),
 (0.09221853178812889,
  frozenset({9, 10}),
  ('Aerys-II-Targaryen', 'Brandon-Stark')),
 (0.09640725127267957, frozenset({37, 55}), ('Rickon-Stark', 'Hodor')),
 (0.10494281863042809,
  frozenset({26, 35}),
  ('Bran-Stark', 'Myrcella-Baratheon')),
 (0.1052195194810267, frozenset({64, 65}), ('Hoster-Tully', 'Lysa-Arryn')),
 (0.10634770997817156, frozenset({54, 63}), ('Hallis-Mollen', 'Edmure-Tully')),
 (0.10730691546734239, frozenset({42, 46}), ('Syrio-Forel', 'Boros-Blount'))]
In [28]:
selectedEdgesUnsupervised = [e for _,e,_ in sortedPredictions[:10]]

Supervised link prediction

1)Combine node embeddings to obtain pair-of-nodes embeddings

In [29]:
def hadamard1(vi, vj):
    prod=np.multiply(vi, vj)
    return prod

def hadamard2(vi, vj):
    prod=np.multiply(vi, vj)/(np.linalg.norm(vi)*np.linalg.norm(vj))
    return prod

#combining the embeddings
#combinedEmbedding = dict()
edgesEmbeddings = {}
nonEdgesEmbeddings = {}
for i in range(len(theEmbedding)):
    for j in range(i+1,len(theEmbedding)):
        if i!=j:
            combinedEmb = hadamard2(theEmbedding[i],theEmbedding[j])
            if  G.has_edge(i,j):
                edgesEmbeddings[frozenset([i,j])]=combinedEmb
            else:
                nonEdgesEmbeddings[frozenset([i,j])]=combinedEmb

2)Train a logistic classifier from available positive examples and a sample of negative examples

In [30]:
trainingSet = [(k,edgesEmbeddings[k],True) for k in edgesEmbeddings.keys()]
print("positive examples: ",len(trainingSet))

randomKeys = np.random.choice(list(nonEdgesEmbeddings.keys()),len(edgesEmbeddings),replace=False)
trainingSet += [(k,nonEdgesEmbeddings[k],False) for k in randomKeys]
print("all examples: ",len(trainingSet))

logistic = sk.linear_model.LogisticRegression()
logistic.fit([x for (_,x,l) in trainingSet],[l for (_,x,l) in trainingSet])
nonEdgesList = list(nonEdgesEmbeddings.keys())
scores = logistic.decision_function([nonEdgesEmbeddings[k] for k in nonEdgesList])
sortedPredictions = sorted([(scores[i],nonEdgesList[i])for i in range(len(scores))],reverse=True)
positive examples:  340
all examples:  680

Get the 10 most probable edges

In [31]:
names= nx.get_node_attributes(G,"name")
sortedPredictions = [(score,IDs,(names[list(IDs)[0]],names[list(IDs)[1]])) for score,IDs in sortedPredictions]
selectedEdgesSupervised = [e for _,e,_ in sortedPredictions[:10]]
sortedPredictions[:10]
Out[31]:
[(1.4848168613448642,
  frozenset({12, 46}),
  ('Robert-Baratheon', 'Boros-Blount')),
 (1.376278306612967, frozenset({9, 75}), ('Aerys-II-Targaryen', 'Irri')),
 (1.3650048748297818,
  frozenset({65, 67}),
  ('Lysa-Arryn', 'Jon-Umber-(Greatjon)')),
 (1.3517883369107702, frozenset({10, 32}), ('Meryn-Trant', 'Brandon-Stark')),
 (1.305252755307228, frozenset({18, 46}), ('Grenn', 'Boros-Blount')),
 (1.3049362016512536, frozenset({45, 75}), ('Irri', 'Barristan-Selmy')),
 (1.2844621595080878,
  frozenset({12, 53}),
  ('Robert-Baratheon', 'Loras-Tyrell')),
 (1.2769638792085405, frozenset({32, 40}), ('Meryn-Trant', 'Sandor-Clegane')),
 (1.2648704873408332, frozenset({53, 62}), ('Loras-Tyrell', 'Brynden-Tully')),
 (1.2550792796555654, frozenset({32, 47}), ('Meryn-Trant', 'Pycelle'))]

Plotting predicted edges

In [32]:
selectedEdges = selectedEdgesUnsupervised
#selectedEdges = selectedEdgesSupervised



tempGraph = G.copy()
tempGraph.add_edges_from(selectedEdges)
colors = []
for e in tempGraph.edges():
    if frozenset(e) in selectedEdges:
        colors.append("blue")
    else:
        colors.append("grey")
plt.rcParams['figure.figsize'] = [17, 10]
nx.draw_networkx(tempGraph, edge_color=colors)
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:522: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(edge_color) \
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:526: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  for c in edge_color]):
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:724: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(label):

A (naive) example of label prediction

Often, datasets have "labels" on nodes, properties that characterize them.

In this toy dataset, we can use the "family name" of characters as an attribute.

The goal would be to predict which characters would be the most likely to also be part of the family.

We start by identifying characters of a class

In [33]:
names = nx.get_node_attributes(G,"name")
stark = [(k,n,True) for k,n in names.items() if "Stark" in n]
stark
Out[33]:
[(27, 'Catelyn-Stark', True),
 (3, 'Eddard-Stark', True),
 (38, 'Robb-Stark', True),
 (24, 'Arya-Stark', True),
 (26, 'Bran-Stark', True),
 (10, 'Brandon-Stark', True),
 (41, 'Sansa-Stark', True),
 (25, 'Benjen-Stark', True),
 (37, 'Rickon-Stark', True)]

We train a logistic classifier to recognize family name from node features (directly)

An plot the most likely candidates

In [34]:
nonStark = [(k,v,False) for k,v in names.items() if k not in [v for v,_,_ in stark]]
logistic = sk.linear_model.LogisticRegression()
logistic.fit([theEmbedding[i] for i,_,_ in stark+nonStark],[l for _,_,l in stark+nonStark])
scores = logistic.decision_function([theEmbedding[i] for (i,_,_) in nonStark])
sortedPredictions = sorted([(scores[i],nonStark[i])for i in range(len(scores))],reverse=True)
sortedPredictions[:10]
Out[34]:
[(-0.7039584892255808, (80, 'Todder', False)),
 (-0.7066667530365709, (8, 'Samwell-Tarly', False)),
 (-0.742349905515772, (20, 'Tyrion-Lannister', False)),
 (-0.8440833769788808, (67, 'Jon-Umber-(Greatjon)', False)),
 (-0.9146646108586127, (64, 'Hoster-Tully', False)),
 (-0.9408125684642789, (50, 'Theon-Greyjoy', False)),
 (-1.0258221626884911, (70, 'Haggo', False)),
 (-1.0763994957704113, (76, 'Jhiqui', False)),
 (-1.226576457657273, (22, 'Jory-Cassel', False)),
 (-1.2355813280328216, (7, 'Jon-Snow', False))]

We plot the graph with original labels in blue and likeliness in grey

position of nodes can be force layout

In [35]:
maxValue = max([x for x,_ in sortedPredictions])
scoresAllNodes = sortedPredictions+[(maxValue,(ID,name,b)) for (ID,name,b) in stark]
scoresAllNodes = sorted(scoresAllNodes,key=itemgetter(1))
positionForceLayout=True
#feature = 7
nbNodes = G.number_of_nodes()
normalized = sk.preprocessing.minmax_scale([x for (x,_) in scoresAllNodes])
colors = [Greys(normalized[i]) for i in range(nbNodes)]
for i in range(len(colors)):
    if scoresAllNodes[i][1] in stark:
        colors[i]=[0.2,0.8,0.8,1]
In [36]:
plt.rcParams['figure.figsize'] = [17, 10]

if positionForceLayout:
    nx.draw_networkx(G,
                       node_color=colors,nodelist=range(nbNodes),
                       width=0.1, node_size=500,
                       arrows=False, alpha=0.8,
                       font_size=10,labels=nx.get_node_attributes(G,"name"))
else:
    nx.draw_networkx(G,pos,
                       node_color=colors,nodelist=range(nbNodes),
                       width=0.1, node_size=500,
                       arrows=False, alpha=0.8,
                       font_size=10,labels=nx.get_node_attributes(G,"name"))
sm = plt.cm.ScalarMappable(cmap=Greys, norm=plt.Normalize(vmin=0, vmax=1))
sm._A = []
plt.colorbar(sm)
plt.show()
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:522: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(edge_color) \
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:543: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if cb.is_string_like(edge_color) or len(edge_color) == 1:
/Users/cazabetremy/.local/lib/python3.6/site-packages/networkx-1.11-py3.6.egg/networkx/drawing/nx_pylab.py:724: MatplotlibDeprecationWarning: The is_string_like function was deprecated in version 2.1.
  if not cb.is_string_like(label):