diff --git a/facetnet.py b/facetnet.py index 2176996..85dcca3 100644 --- a/facetnet.py +++ b/facetnet.py @@ -1,283 +1,296 @@ import numpy as np import matplotlib.pyplot as plt import pandas as pd # from sklearn.metrics import mutual_info_score def soft_moduarity(wc, dc_inv, xc, lc): # Calculate the soft modularity as defined by equation (7) temp = np.linalg.multi_dot([dc_inv, xc, lc]) - temp_t = np.transpose(temp) n = wc.shape[0] - qc_s = np.trace(np.linalg.multi_dot([temp_t, wc, temp])) - np.linalg.multi_dot([np.ones((1, n)), temp, temp_t, wc, np.ones((n, 1))]) + # qc_s = np.trace(np.linalg.multi_dot([temp.T, wc, temp])) - np.linalg.multi_dot([np.ones((1, n)), wc.T, temp, temp.T, wc, np.ones((n, 1))]) + qc_s = np.linalg.multi_dot([np.ones((1, n)), wc.T, temp, temp.T, wc, np.ones((n, 1))]) + return qc_s def similarity(adj_mat, weighted, gamma=0.2): # calculate a similarity matrix from the adjacency matrix, basically an exponential scaling n, m = adj_mat.shape sim = np.zeros(adj_mat.shape) if weighted: for i in range(n): for j in range(m): if adj_mat[i, j] != 0: sim[i, j] = np.exp(-1.0/(gamma * adj_mat[i, j])) else: sim[i, j] = 0.0 else: sim = adj_mat / 2.0 for i in range(n): # populate the diagonal sim[i, i] = 1.0 sim /= np.sum(sim) return sim def normalize_rows(w: np.ndarray): # normalize matrix w row-wise row_norm = np.linalg.norm(w, ord=1, axis=1) for i in range(w.shape[0]): if row_norm[i] != 0: w[i, :] /= row_norm[i] return w def normalize_cols(xc: np.ndarray): # normalize matrix xc column-wise col_norm = np.linalg.norm(xc, ord=1, axis=0) for j in range(xc.shape[1]): if col_norm[j] != 0: xc[:, j] /= col_norm[j] return xc def kl_divergence(a_mat, b_mat): # Kullback-Leibler divergence for all non zero elements of a_mat and b_mat assert a_mat.shape == b_mat.shape n, m = a_mat.shape res = 0.0 for i in range(n): for j in range(m): if a_mat[i, j] > 1e-20 and b_mat[i, j] > 1e-20: res += a_mat[i, j] * np.log(a_mat[i, j] / b_mat[i, j]) - a_mat[i, j] + b_mat[i, j] return res def param_update(yc, wc, alpha, is_plot=False): # Update function for xc (capital X and capital L), implementation of equations (4) and (5) n, m = yc.shape # xc_old = np.random.rand(xc.shape[0], xc.shape[1]) xc_old = np.random.rand(n, m) xc_old = normalize_cols(xc_old) # lc_old = lc_new lc_old = np.diag(np.random.rand(m)) lc_old /= np.trace(lc_old) eps = 1e-5 max_iter = int(1e3) gamma = np.zeros(max_iter) for it in range(max_iter): wc_approx = xc_old.dot(lc_old.dot(xc_old.T)) xc_new = np.zeros(xc_old.shape) lc_new = np.zeros(lc_old.shape) for k in range(m): for i in range(n): for j in range(n): if wc[i, j] != 0: # avoid divisions by zero in wc_approx xc_new[i, k] += wc[i, j] * lc_old[k, k] * xc_old[j, k] / wc_approx[i, j] lc_new[k, k] += wc[i, j] * xc_old[i, k] * xc_old[j, k] / wc_approx[i, j] xc_new[i, k] *= (2 * alpha * xc_old[i, k]) xc_new[i, k] += (1 - alpha) * yc[i, k] lc_new[k, k] *= (alpha * lc_old[k, k]) lc_new[k, k] += (1 - alpha) * sum(yc[:, k]) xc_new = normalize_cols(xc_new) lc_new /= np.trace(lc_new) gamma[it] = alpha * kl_divergence(wc, np.linalg.multi_dot([xc_new, lc_new, xc_new.T])) + (1 - alpha) * kl_divergence(yc, xc_new.dot(lc_new)) if it == 0: gamma_min = gamma[it] else: if abs((gamma[it] - gamma_min)) / gamma_min < eps: if is_plot: plt.plot(gamma[0:it]) plt.ylabel(r"$\gamma$") plt.xlabel("# iter") plt.show() print("====> no iter: {}".format(it)) return xc_res, lc_res if gamma[it] < gamma[it - 1]: gamma_min = gamma[it] xc_res = xc_new lc_res = lc_new else: if is_plot: plt.plot(gamma[0:it]) plt.ylabel(r"$\gamma$") plt.xlabel("# iter") plt.show() print("====> no iter: {}".format(it)) return xc_res, lc_res xc_old = xc_new lc_old = lc_new raise Exception('Maximum iteration number reached: {}'.format(max_iter)) def read_edge_list(filename, weighted=False): print("Reading edge file {}".format(filename)) idmap = set() edge_cache = {} with open(filename) as f: for line in f: if weighted: u, v, w = [int(x) for x in line.strip().split()] else: tmp = [int(x) for x in line.strip().split()] u, v, w = tmp[0], tmp[1], 1.0 edge_cache[(u, v)] = w idmap.add(u) idmap.add(v) idmap = list(idmap) idmap_inv = {nid: i for i, nid in enumerate(idmap)} n = len(idmap) adj_mat = np.zeros((n, n)) for (u, v), w in edge_cache.items(): adj_mat[idmap_inv[u], idmap_inv[v]] = w adj_mat += adj_mat.T return idmap, idmap_inv, adj_mat def facetnet_evolution(net_path, alpha, t_steps, m, show_plot=False): """ Applies facetNet algorithm on an entire set of consecutively named edgelist.X files. Can deal with appearing and disappearing number of nodes. Establishes soft community memberships to an a priori given number of communities (m).. :param net_path: Path to edge-lists :param alpha: Alpha cost weight :param t_steps: Total number of time-steps to be computed (must be at least # of edge-list files) :param m: Number of (a priori) communities :param show_plot: Bool, show convergence plot of gamma :return: No return value, saves soft_comm as soft_comm_alpha0.X_nwX.csv files in net_path folder """ clusters = [] for i in range(m): clusters.append("cluster_{}".format(i)) for t in range(t_steps): print("time-step:", t) idmap, idmap_inv, adj_mat = read_edge_list(net_path + "/%d.edgelist" % t, weighted=True) n = len(idmap) wc = normalize_rows(adj_mat) print("Calculate network representation ...") if t == 0: xc = np.random.rand(n, m) xc = normalize_cols(xc) lc = np.diag(np.random.rand(m)) lc /= np.trace(lc) yc = xc.dot(lc) xc, lc, = param_update(yc, wc, 1.0, show_plot) else: reserved_rows = [idmap_inv0[x] for x in idmap0 if x in idmap] num_new = len(set(idmap) - set(idmap0)) num_old = len(reserved_rows) xc = xc[reserved_rows, :] xc = normalize_cols(xc) xc *= num_old / (num_old + num_new) if num_new != 0: xc = np.pad(xc, ((0, num_new), (0, 0)), mode='constant', constant_values=(1 / num_new, 1 / num_new)) yc = xc.dot(lc) xc, lc, = param_update(yc, wc, alpha, show_plot) yc = xc.dot(lc) dc_inv = np.zeros(n) for i in range(n): dc_inv[i] = 1 / np.sum(yc[i, :]) dc_inv = np.matrix(np.diag(dc_inv)) soft_comm = dc_inv.dot(yc) df = pd.DataFrame(data=soft_comm, columns=clusters) df.insert(0, column="id", value=idmap) df.to_csv("{}/soft_comm_alpha{}_nw{}.csv".format(net_path, alpha, t), index=False) idmap0 = idmap idmap_inv0 = idmap_inv def facetnet_step(edgelist_path, alpha, m, show_plot=False, xc_prev=None, lc_prev=None, idmap0=None, idmap_inv0=None): """ Applies one step of the facetNet algorithm, equivalent to a single step of facetnet_evolution :param xc_prev: X(t-1) matrix :param lc_prev: LAMBDA(t-1) matrix :param idmap0: previous id map :param idmap_inv0: inverse previous idmap :param edgelist_path: Path to current edgelist :param alpha: Alpha cost weight :param m: Number of (a priori) communities :param show_plot: Bool, show convergence plot of gamma :return: idmap, idmap_inv, xc, lc, soft_comm_df """ clusters = [] for i in range(m): clusters.append("cluster_{}".format(i)) idmap, idmap_inv, adj_mat = read_edge_list(edgelist_path, weighted=True) n = len(idmap) wc = normalize_rows(adj_mat) print("Calculate network representation ...") if xc_prev is None: print("This is an initilaizing step (t = 0)") xc = np.random.rand(n, m) xc = normalize_cols(xc) lc = np.diag(np.random.rand(m)) lc /= np.trace(lc) yc = xc.dot(lc) xc, lc, = param_update(yc, wc, 1.0, show_plot) else: reserved_rows = [idmap_inv0[x] for x in idmap0 if x in idmap] num_new = len(set(idmap) - set(idmap0)) num_old = len(reserved_rows) xc = xc_prev[reserved_rows, :] xc = normalize_cols(xc) xc *= num_old / (num_old + num_new) if num_new != 0: xc = np.pad(xc, ((0, num_new), (0, 0)), mode='constant', constant_values=(1 / num_new, 1 / num_new)) yc = xc.dot(lc_prev) xc, lc, = param_update(yc, wc, alpha, show_plot) yc = xc.dot(lc) dc_inv = np.zeros(n) for i in range(n): dc_inv[i] = 1 / np.sum(yc[i, :]) dc_inv = np.matrix(np.diag(dc_inv)) soft_comm = dc_inv.dot(yc) + comm_net = np.linalg.multi_dot([lc, xc.T, soft_comm]) + if xc_prev is None: + evol_net = None + else: + evol_net = np.linalg.multi_dot([lc_prev, xc_prev.T, soft_comm]) + + qc_s = soft_moduarity(wc, dc_inv, xc, lc) + soft_comm_df = pd.DataFrame(data=soft_comm, columns=clusters) soft_comm_df.insert(0, column="id", value=idmap) - return idmap, idmap_inv, xc, lc, soft_comm_df + comm_net_df = pd.DataFrame(data=comm_net, columns=clusters) + + evol_net_df = pd.DataFrame(data=evol_net, columns=clusters) + + return idmap, idmap_inv, xc, lc, qc_s, soft_comm_df, comm_net_df, evol_net_df diff --git a/facetnet_step.py b/facetnet_step.py index 06375be..c1541eb 100644 --- a/facetnet_step.py +++ b/facetnet_step.py @@ -1,54 +1,60 @@ import argparse import sys import numpy as np import facetnet as fn if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("edgelist", help="full path to edge-list file", type=str) parser.add_argument("alpha", help="alpha cost weight (0.0, 1.0] for gamma = CS*alpha + CT*(1-alpha)", type=float) parser.add_argument("m_cluster", help="number of clusters", type=int) parser.add_argument("res_folder", help="path to results folder, where" "xcap.dat, lcap.dat, idmap.dat, idmap_inv.dat, soft_comm.csv" "will be saved", type=str) parser.add_argument("t_step", help="network time step, to save result", type=int) parser.add_argument("--xcap", help="full path to xcap.dat file", type=str) parser.add_argument("--lcap", help="full path to lcap.dat file", type=str) parser.add_argument("--idmap", help="full path to idmap.dat file", type=str) parser.add_argument("--idmap_inv", help="full path to idmap_inv.dat file", type=str) parser.add_argument("--show", help="show convergence plot of gamma", type=str) args = parser.parse_args() if args.alpha <= 0.0 or args.alpha > 1.0: print("Alpha value must be in (0.0, 1.0], terminate script") sys.exit(1) print("Analyzing network: {}".format(args.edgelist)) print("# time-step: {}, alpha: {}, # clusters: {}".format(args.t_step, args.alpha, args.m_cluster)) if args.show is not None: is_show = True else: is_show = False if args.xcap is None and args.lcap is None and args.idmap is None and args.idmap_inv is None: - idmap, idmap_inv, xc, lc, soft_comm_df = fn.facetnet_step(args.edgelist, args.alpha, args.m_cluster, is_show) + idmap, idmap_inv, xc, lc, qc_s, soft_comm_df, comm_net_df, evol_net_df = fn.facetnet_step( + args.edgelist, args.alpha, args.m_cluster, is_show) elif args.xcap is not None and args.lcap is not None and args.idmap is not None and args.idmap_inv is not None: xc = np.load(args.xcap, allow_pickle=True) lc = np.load(args.lcap, allow_pickle=True) idmap = np.ndarray.tolist(np.load(args.idmap, allow_pickle=True)) idmap_inv = np.ndarray.tolist(np.load(args.idmap_inv, allow_pickle=True)) - idmap, idmap_inv, xc, lc, soft_comm_df = fn.facetnet_step(args.edgelist, args.alpha, args.m_cluster, is_show, xc, lc, idmap, idmap_inv) + idmap, idmap_inv, xc, lc, qc_s, soft_comm_df, comm_net_df, evol_net_df = fn.facetnet_step( + args.edgelist, args.alpha, args.m_cluster, is_show, xc, lc, idmap, idmap_inv) + evol_net_df.to_csv("{}/evol_net_step_alpha{}_nw{}.csv".format(args.res_folder, args.alpha, args.t_step), index=False) else: print("Some of (xcap, ycap, idmap, idmap_inv) are given, some not") sys.exit(1) soft_comm_df.to_csv("{}/soft_comm_step_alpha{}_nw{}.csv".format(args.res_folder, args.alpha, args.t_step), index=False) + comm_net_df.to_csv("{}/comm_net_step_alpha{}_nw{}.csv".format(args.res_folder, args.alpha, args.t_step), index=False) + xc.dump("{}/xcap_alpha{}_nw{}.dat".format(args.res_folder, args.alpha, args.t_step)) lc.dump("{}/lcap_alpha{}_nw{}.dat".format(args.res_folder, args.alpha, args.t_step)) np.asarray(idmap).dump("{}/idmap_alpha{}_nw{}.dat".format(args.res_folder, args.alpha, args.t_step)) np.asarray(idmap_inv).dump("{}/idmap_inv_alpha{}_nw{}.dat".format(args.res_folder, args.alpha, args.t_step)) + print("Soft modularity Qc={}".format(qc_s)) print("Results written to: {}".format(args.res_folder))