import random
import numpy as np
import networkx as nx
import seaborn as sns
import pickle
%matplotlib inline
dirTemp = "temp/" #directory for manipulating graphs and embedding
G = pickle.load( open( dirTemp+"pickledGraph.p", "rb" ) )
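The pickled file is assumed to contain a weighted networkx graph, i.e. every edge carries a "weight" attribute, since the walk generator below relies on it. As a hedged sketch, here is how such a file could be produced; the karate-club graph and the file name "exampleGraph.p" are purely illustrative:
# Sketch (assumption): building and pickling a weighted graph compatible with this notebook.
# Node labels are integers 0..V-1, which the one-hot encoding below also assumes.
G_example = nx.karate_club_graph()
nx.set_edge_attributes(G_example, 1.0, "weight")  # the walk generator expects a "weight" on every edge
with open(dirTemp + "exampleGraph.p", "wb") as f:
    pickle.dump(G_example, f)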
def generate_walks(d_graph, walk_length, num_walks):
    """
    Generates the random walks which will be used as the skip-gram input.
    :return: List of walks. Each walk is a list of nodes.
    """
    walks = list()
    # ------------for the number of walks we want to run-------------
    for n_walk in range(num_walks):
        # Shuffle the nodes
        shuffled_nodes = list(d_graph.nodes())
        random.shuffle(shuffled_nodes)
        # ------------Start a random walk from every node------------
        for source in shuffled_nodes:
            # Start walk
            walk = [source]
            # ------------Perform walk------------
            while len(walk) < walk_length:
                # ------------Get the neighbors of the last node in the walk------------
                walk_options = d_graph[walk[-1]]
                # Stop at dead-end nodes
                if len(walk_options) == 0:
                    break
                nodes = [n for n in walk_options]
                # Transition probabilities are proportional to the edge weights
                probas = [v["weight"] for v in walk_options.values()]
                prob_factor = 1 / sum(probas)
                probas = [prob_factor * p for p in probas]
                # ------------Choose a neighbor at random------------
                walk_to = np.random.choice(nodes, size=1, p=probas)[0]
                walk.append(walk_to)
            # walk = list(map(str, walk))  # Convert all to strings
            walks.append(walk)
    return walks
walks = generate_walks(G,walk_length=80,num_walks=10)
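A quick sanity check of the output; each walk is simply a list of node identifiers, at most walk_length long:
# Sanity check: number of walks and a peek at the first one
print(len(walks), "walks generated")   # should be num_walks * number of nodes
print(walks[0][:10])                   # first ten steps of the first walk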
Of course, the right solution is to use one of the many efficient existing implementations. For instance, with gensim one can simply write:
embedding = gensim.models.Word2Vec(walks, **skip_gram_params)
But here we will look at a basic, simple matrix implementation. It is taken (and simplified) from:
https://github.com/cbellei/word2veclite/blob/master/word2veclite/word2veclite.py
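For completeness, here is a hedged sketch of what the gensim call could look like; the hyper-parameter values are arbitrary, the parameter names follow gensim 4.x, and the node identifiers are converted to strings because gensim expects string tokens:
# Sketch (assumption): training the embedding with gensim instead of the manual code below.
import gensim
str_walks = [[str(n) for n in walk] for walk in walks]            # gensim expects string tokens
skip_gram_params = dict(vector_size=8, window=3, min_count=1, sg=1, workers=2)
embedding = gensim.models.Word2Vec(str_walks, **skip_gram_params)
print(embedding.wv[str(list(G.nodes())[0])])                      # embedding vector of one node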
# The main method that does the parameter learning
def cbow(context, label, W1, W2, loss):
    """
    Implementation of the Continuous-Bag-of-Words Word2Vec model
    :param context: all the context words (these represent the inputs), a c x V matrix
    :param label: the center word (this represents the label), a 1 x V matrix
    :param W1: weights from the input to the hidden layer (V x N)
    :param W2: weights from the hidden to the output layer (N x V)
    :param loss: float that represents the current value of the loss function
    :return: updated weights and loss
    """
    # context is 'x' from the tokenizer (c x V), label is 'y' (1 x V)
    # Average the one-hot context vectors: x is a 1 x V matrix
    x = np.matrix(np.mean(context, axis=0))
    # W1 is a V x N matrix, so h = W1.T x.T is an N x 1 matrix (hidden layer)
    h = np.matmul(W1.T, x.T)
    # W2 is an N x V matrix, so u = W2.T h is a V x 1 matrix (scores)
    u = np.matmul(W2.T, h)
    # y_pred is a V x 1 matrix (predicted probabilities)
    y_pred = softmax(u)
    # e is the prediction error, a V x 1 matrix
    e = -label.T + y_pred
    # h is N x 1 and e is V x 1, so dW2 is N x V
    dW2 = np.outer(h, e)
    # x.T is V x 1 and W2·e is N x 1, so dW1 is V x N
    dW1 = np.outer(x.T, np.matmul(W2, e))
    # Gradient descent step (eta is the learning rate, defined globally below)
    new_W1 = W1 - eta * dW1
    new_W2 = W2 - eta * dW2
    # label is a 1 x V matrix, so label.T is a V x 1 matrix
    loss += -float(u[label.T == 1]) + np.log(np.sum(np.exp(u)))
    return new_W1, new_W2, loss
# A utility function to compute the "softmax"
def softmax(x):
    """Calculate softmax-based probability for a given input vector
    # Arguments
        x: numpy array/list
    # Returns
        softmax of input array
    """
    # Subtract the max for numerical stability
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)
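Before wiring cbow into the training loop, a small shape check on a toy example can help; all the numbers below are arbitrary (V = 5 nodes, N = 2 dimensions, 3 context words), and eta is set here because cbow reads the learning rate from that global:
# Toy shape check (illustrative numbers only)
eta = 0.1
W1_toy, W2_toy = np.random.rand(5, 2), np.random.rand(2, 5)
context_toy = np.eye(5)[[0, 2, 3]]          # 3 x V one-hot context matrix
label_toy = np.eye(5)[[1]]                  # 1 x V one-hot center word
W1_toy, W2_toy, loss_toy = cbow(context_toy, label_toy, W1_toy, W2_toy, 0.)
print(W1_toy.shape, W2_toy.shape, loss_toy) # (5, 2) (2, 5) and a positive loss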
# A utility function to compute "contexts" and "center words" from "sentences"
def corpus2io(corpus_tokenized, nbNodes, window_size):
    """Converts the corpus into pairs of context and center words
    # Arguments
        corpus_tokenized: corpus text (for us, the list of random walks)
        nbNodes: number of nodes in the graph, i.e. the vocabulary size V
        window_size: size of the context window
    # Returns
        context and center words (arrays), one pair per center word
    """
    # for each "sentence" (walk) in the corpus
    for words in corpus_tokenized:
        # how many words in the sentence
        L = len(words)
        for index, word in enumerate(words):
            # s: index of the first element of the context
            s = index - window_size
            # e: index of the last element of the context (exclusive)
            e = index + window_size + 1
            # extract the words in the context, skipping the center word itself
            contexts = [words[i] for i in range(s, e) if 0 <= i < L and i != index]
            center = word
            # x has shape c x V where c is the size of the context
            x = np.zeros((len(contexts), nbNodes))
            for i in range(len(contexts)):
                x[i][contexts[i]] = 1
            # y has shape 1 x V (one-hot encoding of the center word)
            y = np.zeros((1, nbNodes))
            y[0][center] = 1
            yield (x, y)
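To see what the generator yields, one can feed it a tiny hand-made "walk"; the vocabulary size of 5 below is arbitrary:
# Illustrative example: one short walk over 5 nodes, window of size 1
for x_ex, y_ex in corpus2io([[0, 1, 2]], 5, 1):
    print("center:", int(np.argmax(y_ex)), "- context:", [int(np.argmax(r)) for r in x_ex])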
def initialize(V, N):
    """
    Initialize the weights of the neural network.
    :param V: size of the vocabulary (number of nodes)
    :param N: size of the hidden layer (embedding dimension)
    :return: weights W1, W2
    """
    np.random.seed(100)
    W1 = np.random.rand(V, N)
    W2 = np.random.rand(N, V)
    return W1, W2
We now take our "corpus" of "sentences" (for us, the random walks) and launch the learning.
nbNodes = G.number_of_nodes()
# print(G.nodes())
eta = 0.1        # learning rate
n_epochs = 8
windowSize = 3
W1, W2 = initialize(nbNodes, 8)  # number of nodes and dimension of the embedding
loss_vs_epoch = []
for e in range(n_epochs):
    loss = 0.
    print(W1[0])           # print the first embedding vector to monitor the updates
    print("epoch", e)
    for (context, center) in corpus2io(walks, nbNodes, windowSize):
        (W1, W2, loss) = cbow(context, center, W1, W2, loss)
    loss_vs_epoch.append(loss)
# plotting the learning curve (sns.tsplot has been removed from recent seaborn versions)
sns.lineplot(x=range(n_epochs), y=loss_vs_epoch)
# Save the embedding in the word2vec text format: a "V dim" header, then one node id and its vector per line
shape = str(W1.shape[0]) + " " + str(W1.shape[1])
toPrint = np.c_[np.arange(0, nbNodes), W1]
np.savetxt(dirTemp + "custom.emb", toPrint, delimiter=" ", header=shape, comments="")
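Since the file follows the classic word2vec text format, it can be reloaded later; a hedged sketch of two possible ways, with numpy or with gensim's KeyedVectors:
# Sketch: reloading the saved embedding (both options assume the format written above)
reloaded = np.loadtxt(dirTemp + "custom.emb", skiprows=1)         # column 0 = node id, rest = vector
print(reloaded.shape)                                             # (nbNodes, 1 + embedding dimension)
# from gensim.models import KeyedVectors
# kv = KeyedVectors.load_word2vec_format(dirTemp + "custom.emb")  # node ids become string keys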