%load_ext autoreload
from importlib import reload
import matplotlib.pyplot as plt
from matplotlib.cm import viridis
from matplotlib.cm import Greys
%matplotlib inline
import numpy as np
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise
import sklearn as sk
import community
import seaborn as sns
import networkx as nx
import pandas as pd
import subprocess
import sys
import pickle
from operator import itemgetter
dirTemp = "temp/" #directory for manipulating graphs and embedding
#embeddingFile = "temp/node2vec.emb"
#embeddingFile = "temp/struc2vec.emb"
embeddingFile = "temp/custom.emb"
G = pickle.load( open( dirTemp+"pickledGraph.p", "rb" ) )
def loadEmbedding(file_name):
with open(file_name, 'r') as f:
n, d = f.readline().strip().split()
X = np.zeros((int(n), int(d)))
for line in f:
emb = line.strip().split()
emb_fl = [float(emb_i) for emb_i in emb[1:]]
X[int(float(emb[0])), :] = emb_fl
return X
theEmbedding = loadEmbedding(embeddingFile)
1)Compute the distance between nodes in the embedding
2)The shorter this distance, the most propable a link
embDist = pairwise.cosine_distances(theEmbedding)
#embDist = pairwise.euclidean_distances(theEmbedding)
embDist = {frozenset([i,j]):embDist[i][j] for i in range(len(embDist)) for j in range(len(embDist)) if i!=j}
edges = G.edges()
print("nb edges: ",len(edges))
edges = {frozenset(e) for e in edges}
print("pairs of nodes: ",len(embDist))
missingEdgesScore = {k:v for k,v in embDist.items() if not k in edges}
print("potential edges: ",len(missingEdgesScore))
names= nx.get_node_attributes(G,"name")
sortedPredictions = sorted(((value, key) for (key,value) in missingEdgesScore.items()),reverse=False)
sortedPredictions = [(score,IDs,(names[list(IDs)[0]],names[list(IDs)[1]])) for score,IDs in sortedPredictions]
sortedPredictions[:10]
selectedEdgesUnsupervised = [e for _,e,_ in sortedPredictions[:10]]
1)Combine node embeddings to obtain pair-of-nodes embeddings
def hadamard1(vi, vj):
prod=np.multiply(vi, vj)
return prod
def hadamard2(vi, vj):
prod=np.multiply(vi, vj)/(np.linalg.norm(vi)*np.linalg.norm(vj))
return prod
#combining the embeddings
#combinedEmbedding = dict()
edgesEmbeddings = {}
nonEdgesEmbeddings = {}
for i in range(len(theEmbedding)):
for j in range(i+1,len(theEmbedding)):
if i!=j:
combinedEmb = hadamard2(theEmbedding[i],theEmbedding[j])
if G.has_edge(i,j):
edgesEmbeddings[frozenset([i,j])]=combinedEmb
else:
nonEdgesEmbeddings[frozenset([i,j])]=combinedEmb
2)Train a logistic classifier from available positive examples and a sample of negative examples
trainingSet = [(k,edgesEmbeddings[k],True) for k in edgesEmbeddings.keys()]
print("positive examples: ",len(trainingSet))
randomKeys = np.random.choice(list(nonEdgesEmbeddings.keys()),len(edgesEmbeddings),replace=False)
trainingSet += [(k,nonEdgesEmbeddings[k],False) for k in randomKeys]
print("all examples: ",len(trainingSet))
logistic = sk.linear_model.LogisticRegression()
logistic.fit([x for (_,x,l) in trainingSet],[l for (_,x,l) in trainingSet])
nonEdgesList = list(nonEdgesEmbeddings.keys())
scores = logistic.decision_function([nonEdgesEmbeddings[k] for k in nonEdgesList])
sortedPredictions = sorted([(scores[i],nonEdgesList[i])for i in range(len(scores))],reverse=True)
names= nx.get_node_attributes(G,"name")
sortedPredictions = [(score,IDs,(names[list(IDs)[0]],names[list(IDs)[1]])) for score,IDs in sortedPredictions]
selectedEdgesSupervised = [e for _,e,_ in sortedPredictions[:10]]
sortedPredictions[:10]
selectedEdges = selectedEdgesUnsupervised
#selectedEdges = selectedEdgesSupervised
tempGraph = G.copy()
tempGraph.add_edges_from(selectedEdges)
colors = []
for e in tempGraph.edges():
if frozenset(e) in selectedEdges:
colors.append("blue")
else:
colors.append("grey")
plt.rcParams['figure.figsize'] = [17, 10]
nx.draw_networkx(tempGraph, edge_color=colors)
Often, datasets have "labels" on nodes, properties that characterize them.
In this toy dataset, we can use the "family name" of characters as an attribute.
The goal would be to predict which characters would be the most likely to also be part of the family.
We start by identifying characters of a class
names = nx.get_node_attributes(G,"name")
stark = [(k,n,True) for k,n in names.items() if "Stark" in n]
stark
An plot the most likely candidates
nonStark = [(k,v,False) for k,v in names.items() if k not in [v for v,_,_ in stark]]
logistic = sk.linear_model.LogisticRegression()
logistic.fit([theEmbedding[i] for i,_,_ in stark+nonStark],[l for _,_,l in stark+nonStark])
scores = logistic.decision_function([theEmbedding[i] for (i,_,_) in nonStark])
sortedPredictions = sorted([(scores[i],nonStark[i])for i in range(len(scores))],reverse=True)
sortedPredictions[:10]
position of nodes can be force layout
maxValue = max([x for x,_ in sortedPredictions])
scoresAllNodes = sortedPredictions+[(maxValue,(ID,name,b)) for (ID,name,b) in stark]
scoresAllNodes = sorted(scoresAllNodes,key=itemgetter(1))
positionForceLayout=True
#feature = 7
nbNodes = G.number_of_nodes()
normalized = sk.preprocessing.minmax_scale([x for (x,_) in scoresAllNodes])
colors = [Greys(normalized[i]) for i in range(nbNodes)]
for i in range(len(colors)):
if scoresAllNodes[i][1] in stark:
colors[i]=[0.2,0.8,0.8,1]
plt.rcParams['figure.figsize'] = [17, 10]
if positionForceLayout:
nx.draw_networkx(G,
node_color=colors,nodelist=range(nbNodes),
width=0.1, node_size=500,
arrows=False, alpha=0.8,
font_size=10,labels=nx.get_node_attributes(G,"name"))
else:
nx.draw_networkx(G,pos,
node_color=colors,nodelist=range(nbNodes),
width=0.1, node_size=500,
arrows=False, alpha=0.8,
font_size=10,labels=nx.get_node_attributes(G,"name"))
sm = plt.cm.ScalarMappable(cmap=Greys, norm=plt.Normalize(vmin=0, vmax=1))
sm._A = []
plt.colorbar(sm)
plt.show()