In [1]:
import pandas as pd
import networkx as nx
from sklearn.metrics import roc_auc_score

In [2]:
# Load the first snapshot into `day1`. Later cells group it by the
# `src_identity` / `dst_identity` columns and filter on `value`.
day1 = pd.read_parquet("part-00156-0d84e01b-458f-4875-854c-cab58ad5aaf5.c000.snappy.parquet")

In [3]:
# Sum every other column per (source, destination) pair.
# NOTE(review): net_from_tr performs this same aggregation on its own input and
# no later cell shown here reads `tr` - confirm whether this cell is still needed.
tr = (
    day1
    .groupby(["src_identity", "dst_identity"])
    .sum()
    .reset_index()
)

In [4]:
def net_from_tr(df,threashold_value=100000000,threshold_repetitions=5):
    """Build the graph of large transfers between frequently seen identities.

    Rows of ``df`` are summed per ``(src_identity, dst_identity)`` pair. An
    aggregated pair is kept when all of the following hold:

    * its destination occurs as a destination in at least
      ``threshold_repetitions`` aggregated pairs;
    * its source occurs as a source in at least ``threshold_repetitions``
      aggregated pairs;
    * source and destination differ (no self-loops);
    * its summed ``value`` is strictly greater than ``threashold_value``.

    Both occurrence counts are taken on the full aggregated table, i.e. before
    any of the filters above is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``src_identity``, ``dst_identity`` and ``value``. Every other
        column is summed by the group-by as well.
    threashold_value : number, default 100000000
        Exclusive lower bound on the summed ``value`` of a pair. The spelling
        is part of the signature and is kept for existing callers.
    threshold_repetitions : int, default 5
        Minimum number of aggregated pairs an identity must appear in, counted
        separately for the source role and the destination role.

    Returns
    -------
    tuple
        ``(tr_large, large_g)``: the surviving pairs ordered by ascending
        ``value``, and the graph ``nx.from_pandas_edgelist`` builds from them
        with ``value`` as edge attribute. No ``create_using`` is passed, so
        networkx's default graph class is used.
    """
    src_col, dst_col = "src_identity", "dst_identity"

    pair_totals = df.groupby([src_col, dst_col]).sum().reset_index()

    # How many distinct counterparties each identity has, per role.
    dst_counts = pair_totals[dst_col].value_counts()
    src_counts = pair_totals[src_col].value_counts()
    frequent_dst = dst_counts.loc[dst_counts >= threshold_repetitions].index
    frequent_src = src_counts.loc[src_counts >= threshold_repetitions].index

    # The three row filters are independent of one another, so they can be
    # combined into a single mask.
    keep = (
        pair_totals[dst_col].isin(frequent_dst)
        & pair_totals[src_col].isin(frequent_src)
        & (pair_totals[src_col] != pair_totals[dst_col])
    )

    # Sort before applying the value cut-off so the row order of the returned
    # table (and therefore the edge insertion order) is unchanged.
    by_value = pair_totals[keep].sort_values("value")
    tr_large = by_value[by_value["value"] > threashold_value]

    large_g = nx.from_pandas_edgelist(
        tr_large,
        source=src_col,
        target=dst_col,
        edge_attr="value",
    )
    return tr_large, large_g

In [5]:
# NOTE(review): this reads the same path as the load in cell In [2] and rebinds
# `day1` to the fresh read - confirm whether the second read is needed.
day1 = pd.read_parquet("part-00156-0d84e01b-458f-4875-854c-cab58ad5aaf5.c000.snappy.parquet")

In [6]:
# Filtered edge table and graph for the first snapshot, using net_from_tr's
# default thresholds.
df1,g1 = net_from_tr(day1)

In [7]:
# Number of identities that survived the filters in the first snapshot.
g1.number_of_nodes()

459

In [8]:
# Persist the first snapshot's graph in GraphML format; the relative path
# resolves against the kernel's working directory.
nx.write_graphml(g1,"test2.graphml")

In [9]:
# Snapshot of the first graph's node labels (iterating a graph yields its nodes).
# NOTE(review): no later cell shown here reads `predictions_on`.
predictions_on = list(g1)

In [10]:
# Load the second snapshot into `day2`; it goes through the same net_from_tr
# pipeline as `day1`.
day2 = pd.read_parquet("part-00163-0d84e01b-458f-4875-854c-cab58ad5aaf5.c000.snappy.parquet")

In [11]:
# Filtered edge table and graph for the second snapshot (same default
# thresholds as for the first one); g2 is the ground truth for the predictions.
df2,g2 = net_from_tr(day2)

In [12]:
# Link-prediction scores on the first snapshot's graph. Neither helper is given
# an explicit list of candidate pairs.
PA = nx.preferential_attachment(g1)  # NOTE(review): not consumed by any cell shown here
AA = nx.adamic_adar_index(g1)

# Materialise the Adamic-Adar (node, node, score) triples as a table.
prediction = pd.DataFrame.from_records(
    [triple for triple in AA],
    columns=["n1", "n2", "score"],
)
# A table for the preferential-attachment scores can be built from PA the same way.

In [13]:
# Build the evaluation set: for each scored candidate pair, record its score,
# whether the link exists in the second snapshot (1) or not (0), and the pair.
scores = []
classe = []
pairs = []

# itertuples() yields each column's own values. iterrows() would first pack the
# whole row into one Series, which is slower and would turn the node labels into
# floats if the label columns were numeric.
candidates = prediction[["n1", "n2", "score"]].itertuples(index=False, name=None)
for u, v, score in candidates:
    # A pair can only be judged if both endpoints still exist in the second
    # snapshot; pairs already linked in the first snapshot are not predictions.
    if u in g2 and v in g2 and not g1.has_edge(u, v):
        classe.append(int(g2.has_edge(u, v)))
        scores.append(score)
        pairs.append((u, v))

In [14]:
# Ranking quality of the link scores against the links observed in the second snapshot.
roc_auc_score(y_true=classe, y_score=scores)

0.7506603285424116

In [15]:
# Empty frame that the next cell fills with the evaluation lists.
test =pd.DataFrame()

In [16]:
# Collect the evaluation lists in one table. Building the frame here in a
# single step keeps the cell idempotent: it no longer depends on `test` having
# been created (and left empty) by an earlier cell.
test = pd.DataFrame(
    {
        "scores": scores,
        "classe": classe,
        "pairs": pairs,
    }
)

In [17]:
# The 50 highest-scoring candidate pairs.
test.sort_values("scores", ascending=False).head(50)

Unnamed: 0,scores,classe,pairs
7501,4.540307,0,"(0, Bitstamp.net-old)"
11326,4.422833,1,"(AgoraMarket, Bitstamp.net-old)"
9001,4.418558,1,"(ePay.info, Bitstamp.net-old)"
3034,4.259891,1,"(Cryptsy.com, Bitfinex.com-old2)"
2922,4.171256,0,"(Cryptsy.com, AgoraMarket)"
3912,3.840217,0,"(2226, ePay.info)"
10160,3.776366,0,"(Bitfinex.com-old2, AgoraMarket)"
8945,3.709215,0,"(ePay.info, Poloniex.com)"
10162,3.566595,1,"(Bitfinex.com-old2, OKCoin.com-2)"
5464,3.564195,1,"(Bittrex.com, Huobi.com-2)"


In [18]:
import karateclub
from karateclub import DeepWalk,Role2Vec

In [35]:
# Independent copy of the largest connected component of the first snapshot's graph.
g_CC = g1.subgraph(
    max(nx.connected_components(g1), key=len)
).copy()

In [36]:
# DeepWalk embedding model: 8 output dimensions, window size 4.
model= DeepWalk(dimensions=8,window_size=4)
# Alternative model with the same settings, kept for reference:
#model= Role2Vec(dimensions=8,window_size=4)

In [37]:
# Replace every node label by its position in the node order and keep both
# directions of the mapping (presumably because the embedding model expects
# 0..n-1 integer node ids - confirm against the karateclub documentation).
# NOTE(review): g_CC is rebound to the relabelled graph, so running this cell a
# second time would rebuild both mappings from the integer labels and lose the names.
new_names = dict(zip(g_CC.nodes, range(g_CC.number_of_nodes())))
inverse = dict(zip(new_names.values(), new_names.keys()))
g_CC = nx.relabel_nodes(g_CC, new_names)

In [38]:
# Fit the embedding model on the integer-relabelled largest component.
model.fit(g_CC)

In [39]:
# Embedding matrix; the cells below treat row i as the node with integer label i.
X = model.get_embedding()

In [40]:
# Integer label the relabelling assigned to the "OKCoin.com" node.
new_names["OKCoin.com"]

297

In [41]:
from scipy import spatial

# Cosine distance from every embedded node to the reference node.
# The reference row is looked up by name instead of using a hard-coded index
# copied from an earlier cell's output, which would silently point at another
# node as soon as the data or the node order changes.
reference_idx = new_names["OKCoin.com"]
reference_vec = X[reference_idx]

# Assumes row i of X belongs to the node with integer label i, as before.
node = [inverse[i] for i in range(len(X))]
val = [spatial.distance.cosine(X[i], reference_vec) for i in range(len(X))]

In [42]:
# One row per embedded node with its cosine distance to the reference node.
# NOTE(review): the name `display` shadows IPython's built-in display() helper
# for the rest of the session; renaming it needs a matching change in the cells
# that sort this frame.
display = pd.DataFrame({"node": node, "val": val})

In [43]:
# The 20 nodes closest to the reference node (smallest cosine distance first).
display.sort_values("val", ascending=True).head(20)

Unnamed: 0,node,val
297,OKCoin.com,0.0
370,9834729,0.031355
70,424,0.100904
137,BtcTrade.com,0.111413
107,1647,0.132239
142,4394,0.132872
181,89891,0.149563
373,1659009,0.153569
106,1459726,0.200141
110,99582,0.20189


In [44]:
# The 20 nodes farthest from the reference node (largest cosine distance first).
display.sort_values("val", ascending=False).head(20)

Unnamed: 0,node,val
206,17yMoeo9hxaJLMrA7XzhGHkmVAZEQ864bw,1.609273
214,CoinArch.com,1.603134
77,655983,1.582718
218,SatoshiMines.com,1.516015
310,1GrwqBWKe9bhR1XY8nDtZ16guHmypWVQjB,1.513603
61,EvolutionMarket,1.510991
335,2696191,1.508463
101,2268,1.506685
308,999Dice.com,1.505978
177,69198728,1.480292
