import numpy as np
import matplotlib.pyplot as plt
import argparse
import sys
import pandas as pd
from sklearn.metrics import mutual_info_score
def soft_modularity(wc, dc_inv, xc, lc):
    # Calculate the soft modularity as defined by equation (7)
    temp = np.linalg.multi_dot([dc_inv, xc, lc])
    temp_t = np.transpose(temp)
    n = wc.shape[0]
    qc_s = np.trace(np.linalg.multi_dot([temp_t, wc, temp])) \
        - np.linalg.multi_dot([np.ones((1, n)), temp, temp_t, wc, np.ones((n, 1))])
    return qc_s
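# Illustrative sketch (added, not part of the original file): how soft_modularity
# can be called on random data. The helper name _demo_soft_modularity is
# hypothetical; shapes follow the conventions used below: wc is n x n and
# row-normalized, xc is n x m and column-normalized, lc is m x m diagonal with
# unit trace, and dc_inv holds the inverse row sums of yc = xc.dot(lc).
def _demo_soft_modularity(n=4, m=2):
    wc = np.random.rand(n, n)
    wc /= wc.sum(axis=1, keepdims=True)
    xc = np.random.rand(n, m)
    xc /= xc.sum(axis=0, keepdims=True)
    lc = np.diag(np.random.rand(m))
    lc /= np.trace(lc)
    dc_inv = np.diag(1.0 / xc.dot(lc).sum(axis=1))
    return soft_modularity(wc, dc_inv, xc, lc)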
def similarity(adj_mat, weighted, gamma=0.2):
    # Calculate a similarity matrix from the adjacency matrix, basically an exponential scaling
    n, m = adj_mat.shape
    sim = np.zeros(adj_mat.shape)
    if weighted:
        for i in range(n):
            for j in range(m):
                if adj_mat[i, j] != 0:
                    sim[i, j] = np.exp(-1.0 / (gamma * adj_mat[i, j]))
                else:
                    sim[i, j] = 0.0
    else:
        sim = adj_mat / 2.0
    for i in range(n):
        # populate the diagonal
        sim[i, i] = 1.0
    sim /= np.sum(sim)
    return sim
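# Quick illustration (added, not original): for a weighted graph each non-zero
# entry is mapped through exp(-1/(gamma * w)), so heavier edges yield
# similarities closer to 1; the diagonal is then set to 1 and the whole matrix
# normalized to sum to 1. For example:
#
#   a = np.array([[0.0, 2.0], [2.0, 0.0]])
#   s = similarity(a, weighted=True, gamma=0.2)  # off-diagonals ~ exp(-2.5) before normalization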
def normalize_rows(w: np.ndarray):
    # normalize matrix w row-wise
    return w / np.linalg.norm(w, ord=1, axis=1, keepdims=True)
def normalize_cols(xc: np.ndarray):
    # normalize matrix xc column-wise
    return xc / np.linalg.norm(xc, ord=1, axis=0, keepdims=True)
def kl_divergence(a_mat, b_mat):
    # Generalized Kullback-Leibler divergence, a*log(a/b) - a + b, summed over
    # all elements where both a_mat and b_mat are non-zero
    assert a_mat.shape == b_mat.shape
    n, m = a_mat.shape
    res = 0.0
    for i in range(n):
        for j in range(m):
            if a_mat[i, j] > 1e-20 and b_mat[i, j] > 1e-20:
                res += a_mat[i, j] * np.log(a_mat[i, j] / b_mat[i, j]) - a_mat[i, j] + b_mat[i, j]
    return res
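# Sanity check (illustrative, added): this generalized KL divergence vanishes
# when both matrices are equal and is positive otherwise, e.g.:
#
#   a = np.array([[0.5, 0.5], [0.25, 0.75]])
#   kl_divergence(a, a)        # -> 0.0
#   kl_divergence(a, a[::-1])  # -> > 0.0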
def param_update(yc, wc, alpha):
    # Update X and L, an implementation of equations (4) and (5); iterates the
    # multiplicative update rules from random initial factors until the cost
    # gamma converges
    n, m = yc.shape
    xc_old = np.random.rand(n, m)
    xc_old = normalize_cols(xc_old)
    lc_old = np.diag(np.random.rand(m))
    lc_old /= np.trace(lc_old)
    eps = 1e-6
    max_iter = int(1e3)
    gamma = np.zeros(max_iter)
    for it in range(max_iter):
        wc_approx = xc_old.dot(lc_old.dot(xc_old.T))
        xc_new = np.zeros(xc_old.shape)
        lc_new = np.zeros(lc_old.shape)
        for k in range(m):
            for i in range(n):
                for j in range(n):
                    if wc[i, j] != 0:  # avoid divisions by extremely small values in wc_approx
                        xc_new[i, k] += wc[i, j] * lc_old[k, k] * xc_old[j, k] / wc_approx[i, j]
                        lc_new[k, k] += wc[i, j] * xc_old[i, k] * xc_old[j, k] / wc_approx[i, j]
                xc_new[i, k] *= (2 * alpha * xc_old[i, k])
                xc_new[i, k] += (1 - alpha) * yc[i, k]
            lc_new[k, k] *= (alpha * lc_old[k, k])
            lc_new[k, k] += (1 - alpha) * np.sum(yc[:, k])
        xc_new = normalize_cols(xc_new)
        lc_new /= np.trace(lc_new)
        gamma[it] = alpha * kl_divergence(wc, np.linalg.multi_dot([xc_new, lc_new, xc_new.T])) \
            + (1 - alpha) * kl_divergence(yc, xc_new.dot(lc_new))
        # print("iter: {}, gamma: {}".format(it, gamma[it]))
        if it == 0:
            gamma_min = gamma[it]
            # keep a valid result so the return values are always bound
            xc_res, lc_res = xc_new, lc_new
        else:
            if abs(gamma[it] - gamma_min) / gamma_min < eps:
                plt.plot(gamma[0:it])
                plt.show()
                print("====> no iter: {}".format(it))
                return xc_res, lc_res
            if gamma[it] < gamma_min:
                gamma_min = gamma[it]
                xc_res = xc_new
                lc_res = lc_new
        xc_old = xc_new
        lc_old = lc_new
    raise Exception('Maximum iteration number reached: {}'.format(max_iter))
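# Usage sketch (added): given the previous step's factors xc, lc and the current
# row-normalized weight matrix wc, one update step of the factorization reads
#
#   yc = xc.dot(lc)
#   xc, lc = param_update(yc, wc, alpha=0.5)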
def read_edge_list(filename, weighted=False):
    # TODO: I think the adjacency matrices come in unordered -> almost always 50 percent cluster association.
    print("Reading edge file {}".format(filename))
    idmap = set()
    edge_cache = {}
    with open(filename) as f:
        for line in f:
            if weighted:
                u, v, w = [int(x) for x in line.strip().split()]
            else:
                tmp = [int(x) for x in line.strip().split()]
                u, v, w = tmp[0], tmp[1], 1.0
            edge_cache[(u, v)] = w
            idmap.add(u)
            idmap.add(v)
    idmap = list(idmap)
    idmap_inv = {nid: i for i, nid in enumerate(idmap)}
    n = len(idmap)
    adj_mat = np.zeros((n, n))
    for (u, v), w in edge_cache.items():
        adj_mat[idmap_inv[u], idmap_inv[v]] = w
    adj_mat += adj_mat.T
    return idmap, idmap_inv, adj_mat
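# Expected edge-list file format (inferred from the parser above): one edge per
# line with integer node ids, plus an integer weight column when weighted=True:
#
#   weighted=False        weighted=True
#   0 1                   0 1 3
#   1 2                   1 2 5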
def alg(net_path, alpha, t_steps, n, m):
    # FacetNet with a fixed number of communities and individuals
    xc = np.random.rand(n, m)
    xc = normalize_cols(xc)
    lc = np.diag(np.random.rand(m))
    lc /= np.trace(lc)
    for t in range(t_steps):
        idmap, idmap_inv, adj_mat = read_edge_list(net_path + "/%d.edgelist" % t, weighted=False)
        # TODO: this similarity calculation is experiment specific and must be done outside this function
        # wc = similarity(adj_mat, weighted=True)
        wc = normalize_rows(adj_mat)
        xc_old = xc
        lc_old = lc
        yc = xc.dot(lc)
        xc, lc = param_update(yc, wc, alpha)
        yc = xc.dot(lc)
        dc_inv = np.zeros(n)
        for i in range(n):
            dc_inv[i] = 1 / np.sum(yc[i, :])
        dc_inv = np.diag(dc_inv)  # plain 2-D array; np.matrix is deprecated
        print(dc_inv)
        print(yc)
        soft_comm = dc_inv.dot(yc)
        print("time:", t)
        print("soft_comm")
        print(soft_comm)
        print("community net")
        print(np.linalg.multi_dot([lc, xc.T, soft_comm]))
        print("evolution net")
        print(np.linalg.multi_dot([lc_old, xc_old.T, soft_comm]))
        print("Activity (dc)")
        print(np.diag(dc_inv))
        df = pd.DataFrame(data=soft_comm, columns=idmap)
        df.to_csv("{}/soft_comm_nw{}".format(net_path, t))
def alg_extended(net_path, alpha, t_steps, m):
    # FacetNet for a variable number of nodes; handling a varying number of communities is not implemented
    clusters = []
    for i in range(m):
        clusters.append("cluster_{}".format(i))
    for t in range(t_steps):
        print("time-step:", t)
        idmap, idmap_inv, adj_mat = read_edge_list(net_path + "/%d.edgelist" % t, weighted=False)
        n = len(idmap)
        # wc = similarity(adj_mat, weighted=False)
        wc = normalize_rows(adj_mat)
        if t == 0:
            xc = np.random.rand(n, m)
            xc = normalize_cols(xc)
            lc = np.diag(np.random.rand(m))
            lc /= np.trace(lc)
        else:
            reserved_rows = [idmap_inv0[x] for x in idmap0 if x in idmap]
            num_new = len(set(idmap) - set(idmap0))
            num_old = len(reserved_rows)
            xc = xc[reserved_rows, :]
            xc = normalize_cols(xc)
            xc *= num_old / (num_old + num_new)
            if num_new != 0:
                xc = np.pad(xc, ((0, num_new), (0, 0)), mode='constant', constant_values=(1 / num_new, 1 / num_new))
        print("Calculate network representation ...")
        yc = xc.dot(lc)
        xc, lc = param_update(yc, wc, alpha)
        yc = xc.dot(lc)
        dc_inv = np.zeros(n)
        for i in range(n):
            dc_inv[i] = 1 / np.sum(yc[i, :])
        dc_inv = np.diag(dc_inv)  # plain 2-D array; np.matrix is deprecated
        soft_comm = dc_inv.dot(yc)
        # print("soft_comm {}".format(soft_comm))
        # print("community net {}".format(np.linalg.multi_dot([lc, xc.T, soft_comm])))
        # print("evolution net {}".format(np.linalg.multi_dot([lc_old, xc_old.T, soft_comm])))
        # print("Activity (dc) {}".format(np.diag(dc_inv)))
        # print("yc {}".format(yc))
        df = pd.DataFrame(data=soft_comm, columns=clusters)
        df.insert(0, column="id", value=idmap)
        df.to_csv("{}/soft_comm_alpha{}_nw{}.csv".format(net_path, alpha, t), index=False)
        idmap0 = idmap
        idmap_inv0 = idmap_inv
def exp1():
    # Experiment with the network described in section 4.1.2
    t_steps = 15
    from synthetic import generate_evolution_exp1
    print("generating synthetic graph")
    generate_evolution_exp1("test_data/", tsteps=t_steps)
    print("start the algorithm")
    alpha = 0.5
    n, m = 128, 4
    np.random.seed(0)
    alg("test_data/", alpha, t_steps, n, m)
def exp2():
    # Experiment with adding and removing nodes
    t_steps = 15
    from synthetic import generate_evolution_exp2
    print("generating synthetic graph")
    generate_evolution_exp2("./data/syntetic2/", tsteps=t_steps)
    print("start the algorithm")
    alpha = 0.5
    np.random.seed(0)
    alg_extended("./data/syntetic2/", alpha, t_steps, 4)
def exp3():
    # Experiment with the network described in section 4.1.2, with added weights
    t_steps = 15
    from synthetic import generate_evolution_exp3
    print("generating synthetic graph")
    generate_evolution_exp3("./data/syntetic3/", tsteps=t_steps)
    print("start the algorithm")
    alpha = 0.9
    n = 128
    m = 4
    np.random.seed(0)
    alg("./data/syntetic3/", alpha, t_steps, n, m)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("t_steps", help="number of networks", type=int)
    parser.add_argument("nw_folder", help="folder holding the network data", type=str)
    parser.add_argument("alpha", help="alpha constant from gamma = CS * alpha + CT * (1 - alpha)", type=float)
    parser.add_argument("m_cluster", help="number (m) of clusters", type=int)
    args = parser.parse_args()
    if args.alpha < 0.0 or args.alpha > 1.0:
        print("Alpha value must be in [0.0, 1.0], terminating script")
        sys.exit(1)
    print("Analyzing networks from: {}".format(args.nw_folder))
    print("# time-steps: {}, alpha: {}, # clusters: {}".format(args.t_steps, args.alpha, args.m_cluster))
    alg_extended(args.nw_folder, args.alpha, args.t_steps, args.m_cluster)
    print("soft_communities written to: {}".format(args.nw_folder))
    # path = "/home/matthias/Documents/CodeDataRaph/data_Danielle/24h_networks/col4"
    # exp_raph()
    # print("Experiment with network stated in 4.1.2")
    # exp1()
    # print("Experiment with adding and removing nodes")
    # exp2()
    # print("Experiment with network stated in 4.1.2, adding weight")
    # exp3()