"""
Modified by Matthias Rüegg (matthias.ruegg@unil.ch) on August 13 2019
Based on a program created by github.com/blmoistawinde (https://github.com/blmoistawinde/facetnet-python)
Copyright 2019 __UNIL__. All rights reserved.

This file is part of facetnet-python-unil.

    facetnet-python-unil is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    facetnet-python-unil is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with facetnet-python-unil. If not, see <https://www.gnu.org/licenses/>.
"""

import numpy as np
import matplotlib.pyplot as plt
# from sklearn.metrics import mutual_info_score


def soft_modularity(wc, dc_inv, xc, lc):
    """
    Soft modularity of a community representation, as defined by equation (7).

    :param wc: square (n, n) similarity matrix
    :param dc_inv: matrix multiplied from the left onto xc * lc (see dc_inverse)
    :param xc: X factor of the community representation
    :param lc: LAMBDA factor of the community representation
    :return: the soft modularity as a scalar
    """
    node_count = wc.shape[0]
    membership = np.linalg.multi_dot([dc_inv, xc, lc])

    # First term: Tr(S^T * W * S)
    within = np.trace(np.linalg.multi_dot([membership.T, wc, membership]))

    # Second term: 1^T * W^T * S * S^T * W * 1, which stays a (1, 1) matrix
    ones_row = np.ones((1, node_count))
    ones_col = np.ones((node_count, 1))
    expected = np.linalg.multi_dot([ones_row, wc.T, membership, membership.T, wc, ones_col])

    return (within - expected)[0, 0]


def soft_modularity_alt(soft_comm, W):
    """
    Alternative soft modularity computation, taken from the original facetnet-python project.

    Computes Tr(S^T W S) - (S^T W 1)^T (S^T W 1). The original version relied on the ``*`` operator of
    np.matrix being a matrix product, so plain ndarrays silently produced element-wise products. Both inputs
    are now converted to ndarrays and multiplied with ``@``, which gives the same value for np.matrix input
    and the correct value for ndarray input.

    :param soft_comm: (n, m) soft community membership matrix S (np.matrix or np.ndarray)
    :param W: (n, n) similarity matrix (np.matrix or np.ndarray)
    :return: the soft modularity as a scalar
    """
    membership = np.asarray(soft_comm)
    weights = np.asarray(W)
    node_count = weights.shape[0]

    within = np.trace(membership.T @ weights @ membership)

    # S^T W 1 as a flat vector; its squared length equals 1^T W^T S S^T W 1
    strength = membership.T @ (weights @ np.ones(node_count))

    return within - strength @ strength


def similarity(adj_mat, weighted, gamma=0.2):
    """
    Turn an adjacency matrix into a similarity matrix normalized to total sum 1 (see paper sec 4.2).

    Weighted input is scaled exponentially, exp(-1 / (gamma * a_ij)) for every non-zero entry; zero entries
    stay zero. Unweighted input is simply halved. In both cases the diagonal is set to 1 before the matrix is
    divided by the sum of all its entries. The input is not modified.

    :param adj_mat: 2-D adjacency matrix
    :param weighted: True to apply the exponential scaling, False to use adj_mat / 2
    :param gamma: scaling parameter of the exponential
    :return: similarity matrix as a new float ndarray
    """
    adj = np.asarray(adj_mat, dtype=float)

    if weighted:
        sim = np.zeros(adj.shape)
        linked = adj != 0  # only existing edges are scaled, which also avoids dividing by zero
        sim[linked] = np.exp(-1.0 / (gamma * adj[linked]))
    else:
        sim = adj / 2.0

    np.fill_diagonal(sim, 1.0)
    sim /= np.sum(sim)
    return sim


def normalize_rows(w: np.ndarray):
    """
    Scale each row of w, in place, so that its L1 norm becomes 1. Rows whose norm is zero are left untouched.

    :param w: 2-D array, modified in place
    :return: the same array object w
    """
    l1_per_row = np.linalg.norm(w, ord=1, axis=1)
    for row, scale in zip(w, l1_per_row):
        if scale == 0:
            continue
        row /= scale  # row is a view on w, so the division writes through
    return w


def normalize_cols(xc: np.ndarray):
    """
    Scale each column of xc, in place, so that its L1 norm becomes 1. Columns whose norm is zero are left
    untouched.

    :param xc: 2-D array, modified in place
    :return: the same array object xc
    """
    l1_per_col = np.linalg.norm(xc, ord=1, axis=0)
    for col, scale in zip(xc.T, l1_per_col):
        if scale == 0:
            continue
        col /= scale  # col is a view on xc (via the transpose), so the division writes through
    return xc


def kl_divergence(a_mat, b_mat):
    """
    Generalized Kullback-Leibler divergence sum(a * log(a / b) - a + b) between two equally shaped matrices.

    Only positions where BOTH entries exceed 1e-20 contribute; all other positions (zeros, negative values,
    NaN) are skipped entirely. The sum is evaluated with vectorized NumPy operations instead of a Python
    double loop because this function is called in every iteration of the update functions.

    :param a_mat: first matrix
    :param b_mat: second matrix, same shape as a_mat
    :return: the divergence as a scalar (0.0 if no position qualifies)
    :raises AssertionError: if the shapes differ
    """
    assert a_mat.shape == b_mat.shape

    a_arr = np.asarray(a_mat, dtype=float)
    b_arr = np.asarray(b_mat, dtype=float)

    threshold = 1e-20
    valid = (a_arr > threshold) & (b_arr > threshold)  # comparisons with NaN are False, so NaN is skipped
    a_sel = a_arr[valid]
    b_sel = b_arr[valid]

    return np.sum(a_sel * np.log(a_sel / b_sel) - a_sel + b_sel)


def dc_inverse(yc, n):
    """
    Build the inverse of the diagonal matrix Dc, where dc[i, i] = sum_j{[xc * lc]_ij} (see paragraph 2.5.1).

    :param yc: the product xc * lc
    :param n: number of rows of yc to use (size of the returned matrix)
    :return: (n, n) np.matrix with 1 / (row sum of yc) on the diagonal
    """
    inv_row_sums = np.fromiter((1 / np.sum(yc[row, :]) for row in range(n)), dtype=float, count=n)
    return np.matrix(np.diag(inv_row_sums))


def initialize_cover(n, m):
    """
    Draw a random community representation for time step t = 0 and return the product xc * lc.

    xc is an (n, m) matrix of uniform random numbers with L1-normalized columns, lc is a diagonal (m, m) matrix
    of uniform random numbers scaled to trace 1.

    :param n: number of rows of xc
    :param m: number of columns of xc (size of lc)
    :return: yc = xc * lc, shape (n, m)
    """
    print("Initialization step (t = 0)")

    # The random draws happen in the order xc first, lc second
    membership = normalize_cols(np.random.rand(n, m))

    weights = np.diag(np.random.rand(m))
    weights /= np.trace(weights)

    return membership.dot(weights)


def compensate_node_demography(xc, idmap0, idmap, idmap_inv0):
    """
    Account for demographic changes in the nodes list (see paper paragraph 3.1).

    Rows of nodes that disappeared are dropped, the remaining columns are re-normalized (L1) and scaled by
    num_old / (num_old + num_new), and nodes that are new get an all-zero row.

    Row i of the result belongs to node idmap[i]. The previous implementation returned the surviving rows in
    the order of idmap0 followed by the zero rows, which only matches the current node ordering if every new
    node happens to come after all surviving nodes; otherwise the rows were misaligned with matrices indexed
    by the current id map. For inputs where both orderings coincide the result is unchanged.

    :param xc: X matrix of the previous time step, rows indexed through idmap_inv0 (not modified)
    :param idmap0: node ids of the previous time step
    :param idmap: node ids of the current time step, position = current row index
    :param idmap_inv0: mapping node id -> row index in xc
    :return: matrix with len(idmap) rows and the same number of columns as xc
    """
    previous_nodes = set(idmap0)  # set lookup instead of repeated list scans

    kept_positions = []  # row index in the result (current ordering)
    kept_rows = []       # row index in xc (previous ordering)
    for position, node in enumerate(idmap):
        if node in previous_nodes:
            kept_positions.append(position)
            kept_rows.append(idmap_inv0[node])

    num_old = len(kept_rows)
    num_new = len(idmap) - num_old

    # Fancy indexing copies, so the caller's xc is not touched by the in-place normalization
    kept = normalize_cols(xc[kept_rows, :])
    if num_old:  # also avoids 0 / 0 when there are no nodes at all
        kept *= num_old / (num_old + num_new)

    aligned = np.zeros((len(idmap), xc.shape[1]), dtype=kept.dtype)
    aligned[kept_positions, :] = kept
    return aligned


def _plot_cover_convergence(gamma, it, m):
    # Show the cost values of iterations 0 .. it - 1 for a fit with m communities.
    plt.plot(gamma[0:it])
    plt.ylabel(r"$\gamma$")
    plt.xlabel("# iter")
    plt.title("# communities: {}".format(m))
    plt.show()


def update_cover(yc, wc, alpha, is_plot=False):
    """
    Update function for xc (capital X and capital L) with static community number.
    Implementation of equations (4) and (5). If alpha = 1.0 (i.e. ignoring yc when calculating the community
    representation), yc is only used to determine (n, m) and its values are ignored, so yc may be filled
    with NaN.

    The iteration starts from a random representation and stops when the relative change of the cost with
    respect to the best cost so far drops below eps, or when the cost stops decreasing after iteration 2.
    The representation with the lowest accepted cost is returned.

    :param yc: xc(t-1) * lc(t-1), see paragraph 2.2.2
    :param wc: adjacency matrix, needs to be normalized with total matrix sum, see paragraph 4.2
    :param alpha: snap-shot vs. temporal cost weight, see paragraph 2.2
    :param is_plot: if True, a convergence plot is shown
    :return: xc_res, lc_res
    :raises RuntimeError: if no stopping criterion is met within the maximum number of iterations
    """

    yc = np.asarray(yc, dtype=float)
    wc = np.asarray(wc, dtype=float)
    n, m = yc.shape

    xc_old = normalize_cols(np.random.rand(n, m))

    lc_old = np.diag(np.random.rand(m))
    lc_old /= np.trace(lc_old)

    eps = 1e-5  # see paragraph 4.1.2
    max_iter = int(1e3)
    gamma = np.zeros(max_iter)

    # With alpha = 1 the temporal term has weight 0; skipping it keeps NaN in yc from turning into 0 * NaN
    use_history = alpha != 1.0
    edges = wc != 0  # only these entries contribute, which also avoids divisions by zero in wc_approx

    for it in range(max_iter):
        wc_approx = np.linalg.multi_dot([xc_old, lc_old, xc_old.T])

        ratio = np.zeros(wc_approx.shape)
        ratio[edges] = wc[edges] / wc_approx[edges]

        # weighted[i, k] = sum_j ratio[i, j] * xc_old[j, k]
        weighted = ratio.dot(xc_old)
        lam_old = np.diag(lc_old)

        xc_new = 2 * alpha * xc_old * weighted * lam_old
        lam_new = alpha * lam_old * np.sum(xc_old * weighted, axis=0)
        if use_history:
            xc_new += (1 - alpha) * yc
            lam_new += (1 - alpha) * np.sum(yc, axis=0)

        xc_new = normalize_cols(xc_new)
        lc_new = np.diag(lam_new)
        lc_new /= np.trace(lc_new)

        gamma[it] = alpha * kl_divergence(wc, np.linalg.multi_dot([xc_new, lc_new, xc_new.T]))
        if use_history:
            gamma[it] += (1 - alpha) * kl_divergence(yc, xc_new.dot(lc_new))

        if it == 0:
            gamma_min = gamma[it]
            # Keep the first iterate as result so that an early stop never returns unassigned variables
            xc_res, lc_res = xc_new, lc_new
        else:
            converged = abs((gamma[it] - gamma_min)) / gamma_min < eps
            improved = gamma[it] < gamma[it - 1]

            # Second criterion: cost function starts to diverge after reaching local minimum
            if converged or (not improved and it > 2):
                if is_plot:
                    _plot_cover_convergence(gamma, it, m)
                print("====> no iter: {}".format(it))
                return xc_res, lc_res

            if improved:
                gamma_min = gamma[it]
                xc_res, lc_res = xc_new, lc_new

        xc_old = xc_new
        lc_old = lc_new
    raise RuntimeError('Maximum iteration number reached: {}'.format(max_iter))


def _plot_varm_convergence(gamma_curve, cs_curve, ct_curve, title):
    # Show the total cost together with the snapshot (cs) and temporal (ct) curves.
    plt.plot(gamma_curve, label="gamma")
    plt.plot(cs_curve, label="cs")
    plt.plot(ct_curve, label="ct")
    plt.legend()
    plt.ylabel(r"$\gamma$")
    plt.xlabel("# iter")
    plt.title(title)
    plt.show()


def update_cover_varm(zc, wc, m, alpha, is_plot=False):
    """
    Update function for xc (capital X and capital L) accepting variable community number.
    Implementation of equations (9) and (10). If alpha = 1.0 (i.e. ignoring zc when calculating the community
    representation), the values of zc are ignored by the update, so zc may be filled with NaN.

    The iteration starts from a random representation and stops when the relative change of the cost with
    respect to the best cost so far drops below eps, or when the cost stops decreasing after iteration 2.
    The representation with the lowest accepted cost is returned.

    :param zc: xc(t-1) * lc(t-1) * xc(t-1).T, see paragraph 3.2.2
    :param wc: adjacency matrix, needs to be normalized with total matrix sum, see paragraph 4.2
    :param m: number of communities to which the new community representation should be fitted
    :param alpha: snap-shot vs. temporal cost weight, see paragraph 2.2
    :param is_plot: if True, a convergence plot is shown
    :return: xc_res, lc_res
    :raises RuntimeError: if no stopping criterion is met within the maximum number of iterations
    """

    wc = np.asarray(wc, dtype=float)
    zc = np.asarray(zc, dtype=float)
    n = wc.shape[0]

    xc_old = normalize_cols(np.random.rand(n, m))

    lc_old = np.diag(np.random.rand(m))
    lc_old /= np.trace(lc_old)

    eps = 1e-5  # see paragraph 4.1.2
    max_iter = int(1e3)
    gamma = np.zeros(max_iter)
    cs = np.zeros(max_iter)
    ct = np.zeros(max_iter)

    # Matrix that is approximated; with alpha = 1 zc has weight 0 and is left out so NaN cannot leak in
    target = wc if alpha == 1.0 else alpha * wc + (1 - alpha) * zc
    support = target != 0  # zero entries contribute nothing; skipping them avoids 0 / 0

    for it in range(max_iter):
        wc_approx = np.linalg.multi_dot([xc_old, lc_old, xc_old.T])

        ratio = np.zeros(wc_approx.shape)
        ratio[support] = target[support] / wc_approx[support]

        # weighted[i, k] = sum_j ratio[i, j] * xc_old[j, k]
        weighted = ratio.dot(xc_old)
        lam_old = np.diag(lc_old)

        xc_new = normalize_cols(xc_old * weighted * lam_old)
        lc_new = np.diag(lam_old * np.sum(xc_old * weighted, axis=0))
        lc_new /= np.trace(lc_new)

        temp = np.linalg.multi_dot([xc_new, lc_new, xc_new.T])
        cs[it] = kl_divergence(wc, temp)
        ct[it] = kl_divergence(zc, temp)
        gamma[it] = alpha * cs[it] + (1 - alpha) * ct[it]

        if it == 0:
            gamma_min = gamma[it]
            # Keep the first iterate as result so that an early stop never returns unassigned variables
            xc_res, lc_res = xc_new, lc_new
        else:
            if abs((gamma[it] - gamma_min)) / gamma_min < eps:
                if is_plot:
                    _plot_varm_convergence(gamma[0:it], cs[0:it], ct[0:it],
                                           "# communities: {}".format(m))
                print("====> no iter: {}".format(it))
                return xc_res, lc_res

            if gamma[it] < gamma[it - 1]:
                gamma_min = gamma[it]
                xc_res, lc_res = xc_new, lc_new

            elif it > 2:  # cost function starts to diverge after reaching local minimum
                if is_plot:
                    # This stop is marked with a trailing '*' and shows the weighted cost components
                    _plot_varm_convergence(gamma[0:it], alpha * cs[0:it], (1 - alpha) * ct[0:it],
                                           "# communities: {}*".format(m))
                print("====> no iter: {}".format(it))
                return xc_res, lc_res

        xc_old = xc_new
        lc_old = lc_new
    raise RuntimeError('Maximum iteration number reached: {}'.format(max_iter))


def extract_cover(xc, lc, n, xc_prev=None, lc_prev=None):
    """
    Extract the community structure from a community representation.

    :param xc: X matrix of the current time step
    :param lc: LAMBDA matrix of the current time step
    :param n: number of nodes, passed on to dc_inverse
    :param xc_prev: X matrix of the previous time step, or None if there is none
    :param lc_prev: LAMBDA matrix of the previous time step (only used if xc_prev is given)
    :return: soft_comm, dc_inv, comm_net, evol_net (evol_net is None if xc_prev is None)
    """
    node_weights = xc.dot(lc)
    dc_inv = dc_inverse(node_weights, n)

    # Row-normalized memberships: Dc^-1 * X * LAMBDA
    soft_comm = dc_inv.dot(node_weights)
    comm_net = np.linalg.multi_dot([lc, xc.T, soft_comm])

    if xc_prev is None:
        return soft_comm, dc_inv, comm_net, None

    evol_net = np.linalg.multi_dot([lc_prev, xc_prev.T, soft_comm])
    return soft_comm, dc_inv, comm_net, evol_net


def read_edge_list(filename, weighted=False):
    """
    Read a whitespace separated edge list and build the normalized, symmetrized adjacency matrix.

    Every non-empty line has the form ``u v`` (unweighted) or ``u v w`` (weighted) with integer node ids u, v.
    Weights may be integers or floats. In unweighted mode every edge gets weight 1.0 and any columns after the
    first two are ignored. Blank lines are skipped. If an edge (u, v) appears several times, the last
    occurrence wins. The matrix is symmetrized by adding its transpose and divided by its total sum.

    :param filename: entire path to the edge-list file
    :param weighted: True if the file has a third column holding the edge weight
    :return: idmap (list of node ids, position = matrix index), idmap_inv (node id -> matrix index), wc
    :raises ValueError: if an id or weight cannot be parsed, or a weighted line does not have three columns
    """

    print("Reading edge file {}".format(filename))
    nodes = set()
    edge_cache = {}
    with open(filename) as f:
        for line in f:
            fields = line.split()
            if not fields:  # tolerate blank lines, e.g. a trailing newline at the end of the file
                continue
            if weighted:
                u_txt, v_txt, w_txt = fields
                w = float(w_txt)  # float() also accepts integer weights
            else:
                u_txt, v_txt, w = fields[0], fields[1], 1.0
            u, v = int(u_txt), int(v_txt)
            edge_cache[(u, v)] = w
            nodes.add(u)
            nodes.add(v)

    idmap = list(nodes)
    idmap_inv = {nid: i for i, nid in enumerate(idmap)}
    n = len(idmap)

    adj_mat = np.zeros((n, n))
    for (u, v), w in edge_cache.items():
        adj_mat[idmap_inv[u], idmap_inv[v]] = w
    adj_mat += adj_mat.T

    wc = adj_mat / adj_mat.sum()

    return idmap, idmap_inv, wc


def _select_best_cover(zc, wc, n, alpha, show_plot, max_comm):
    # Fit one representation per community number in 2 .. max_comm - 1 and keep the one with the highest
    # soft modularity. Returns xc, lc, the list of all soft modularities and the index of the best one.
    qc_s_res = []
    candidates = []
    for comm_count in range(2, max_comm):
        xc_ret, lc_ret = update_cover_varm(zc, wc, comm_count, alpha, show_plot)
        candidates.append((xc_ret, lc_ret))
        dc_inv = dc_inverse(xc_ret.dot(lc_ret), n)
        qc_s_res.append(soft_modularity(wc, dc_inv, xc_ret, lc_ret))
    idx_max = np.argmax(qc_s_res)
    xc, lc = candidates[idx_max]
    return xc, lc, qc_s_res, idx_max


def facetnet_step(edgelist_path, alpha, m, weighted=True, show_plot=False, xc_prev=None, lc_prev=None, idmap0=None,
                  idmap_inv0=None, max_comm=6):
    """
    Applies one step of the facetNet algorithm

    If xc_prev is None the call is treated as time step 0 and the representation is fitted to the current
    network only (alpha = 1.0 is used internally). Otherwise xc_prev is first adapted to the current node list
    and the fit uses the given alpha.

    Any m <= 0 selects the automatic mode. Previously m = 0 ran the automatic fit but was reported as
    "0 communities" in the log and in m_eff; it is now handled consistently as automatic mode.

    :param edgelist_path: Path to current edgelist
    :param alpha: Alpha cost weight
    :param m: Number of (a priori) communities, if m = -1, the algorithm detects the best number of communities
    :param weighted: True if W is weighted, False if it is binary
    :param show_plot: Bool, show convergence plot of gamma
    :param xc_prev: X(t-1) matrix
    :param lc_prev: LAMBDA(t-1) matrix
    :param idmap0: previous id map
    :param idmap_inv0: inverse previous idmap
    :param max_comm: in automatic mode the community numbers 2 .. max_comm - 1 are tried

    :return: idmap, idmap_inv, xc, lc, qc_s, soft_comm, comm_net, evol_net, m_eff
             NOTE(review): qc_s is a scalar for m > 0, but in automatic mode the list of soft modularities of
             all tried community numbers is returned (unchanged from the previous behaviour) - confirm
             whether callers expect the maximum instead.
    """

    idmap, idmap_inv, wc = read_edge_list(edgelist_path, weighted)
    n = len(idmap)
    auto_m = m <= 0
    first_step = xc_prev is None

    print("Calculate network representation for {communities} communities".format(
        communities="variable number of" if auto_m else "{}".format(m)))

    if first_step:
        step_alpha = 1.0  # nothing to be consistent with at time step 0
    else:
        # need demography compensated version of xc_prev, also to calculate evol_net
        xc_prev = compensate_node_demography(xc_prev, idmap0, idmap, idmap_inv0)
        step_alpha = alpha

    if auto_m:
        if first_step:
            zc = initialize_cover(n, n)
        else:
            zc = np.linalg.multi_dot([xc_prev, lc_prev, xc_prev.T])
        xc, lc, qc_s_res, idx_max = _select_best_cover(zc, wc, n, step_alpha, show_plot, max_comm)
    else:
        if first_step:
            yc = initialize_cover(n, m)
        else:
            yc = xc_prev.dot(lc_prev)
        xc, lc = update_cover(yc, wc, step_alpha, show_plot)

    # With xc_prev = None (time step 0) extract_cover returns evol_net = None
    soft_comm, dc_inv, comm_net, evol_net = extract_cover(xc, lc, n, xc_prev, lc_prev)

    if auto_m:
        m_eff = idx_max + 2  # the candidates start at 2 communities
        if show_plot:
            qc_s = max(qc_s_res)
            no_iter = np.arange(2, max_comm, 1)
            plt.plot(no_iter, qc_s_res)
            plt.ylabel("Qs")
            plt.xlabel("# communities")
            plt.title(r"$Q_s,max = $ {} for {} communities".format(qc_s, idx_max + 2))
            plt.show()
    else:
        m_eff = m
        qc_s_res = soft_modularity(wc, dc_inv, xc, lc)

    return idmap, idmap_inv, xc, lc, qc_s_res, soft_comm, comm_net, evol_net, m_eff