Source code for dynamicgem.embedding.dynamicTriad

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import sys
import tensorflow as tf
import operator
import time
import os
import importlib
import random

from six.moves import cPickle
from keras import backend as KBack
from os.path import isfile
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.linear_model import LogisticRegression

try:
    from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
except ImportError:
    from sklearn.cross_validation import cross_val_score, KFold, StratifiedKFold
from sklearn import svm
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score


from dynamicgem.embedding.static_graph_embedding import StaticGraphEmbedding
from dynamicgem.utils import graph_util, plot_util, dataprep_util
from dynamicgem.evaluation import visualize_embedding as viz
from dynamicgem.utils.sdne_utils import *
from dynamicgem.graph_generation import dynamic_SBM_graph
from dynamicgem.utils.dynamictriad_utils import *
import dynamicgem.utils.dynamictriad_utils.dataset.dataset_utils as du
import dynamicgem.utils.dynamictriad_utils.algorithm.embutils as eu
from dynamicgem.evaluation import evaluate_link_prediction as lp


[docs]class dynamicTriad(StaticGraphEmbedding):
    """ Dynamic Triad Closure based embedding
        
        DynamicTriad preserves both structural informa- tion 
        and evolution patterns of a given network. The general 
        idea of our approach is to impose triad, which is a group 
        of three vertices and is one of the basic units of networks.

        Args:
            niters (int): Number of iteration to run the algorithm
            starttime (int): start time for the graph step
            datafile (str) : The file for the input graph
            batchsize (int): batch size for training the algorithm
            nsteps (int) : total number of steps in the temporal graph
            embdim (int): embedding dimension
            stepsize (int): step size for the graph
            stepstride (int): stride to consider for temporal stride
            outdir (str): The output directory to store the result
            cachefn (str): Directory to cache the temporary data
            lr (float): Learning rate for the algorithm
            beta (float): coefficients for triad component
            negdup (float): neg/pos ratio during sampling
            datasetmod (str): module name for dataset loading
            trainmod (str): module name for training model 
            pretrain_size (int): size of the  graph for pre-training 
            sampling_args (int): sampling size
            validation (list): link_reconstruction validation data
            datatype (str): type of network data 
            scale (int): scaling 
            classifier (str): type of classifier to be used
            debug (bool): debugging flag
            test (bool): type of test to perform
            repeat (int): Number of times to repeat the learning
            resultdir (str): directory to store the result
            testDataType (str): type of test data
            clname (str) : classifier type 
            node_num (int): number of nodes
        
        Examples:
            >>> from dynamicgem.embedding.dynamicTriad import dynamicTriad
            >>> from dynamicgem.graph_generation import dynamic_SBM_graph
            >>> node_num = 200
            >>> community_num = 2
            >>> node_change_num = 2
            >>> length =5
            >>> dynamic_sbm_series = dynamic_SBM_graph.get_community_diminish_series_v2(node_num,
                                                                                    community_num,
                                                                                    length,
                                                                                    1,
                                                                                    node_change_num)
            >>> graphs = [g[0] for g in dynamic_sbm_series]

            >>> datafile = dataprep_util.prep_input_dynTriad(graphs, length, args.testDataType)

            >>> embedding = dynamicTriad(niters=10,
                                     starttime=0,
                                     datafile=datafile,
                                     batchsize=10,
                                     nsteps=5,
                                     embdim=16,
                                     stepsize=1,
                                     stepstride=1,
                                     outdir='./output',
                                     cachefn='./tmp',
                                     lr=0.001,
                                     beta=0.1,
                                     negdup=1,
                                     datasetmod='dynamicgem.utils.dynamictriad_utils.dataset.adjlist',
                                     trainmod='dynamicgem.utils.dynamictriad_utils.algorithm.dynamic_triad',
                                     pretrain_size=4,
                                     sampling_args={},
                                     validation='link_reconstruction',
                                     datatype='sbm_cd',
                                     scale=1,
                                     classifier='lr',
                                     debug=False,
                                     test='link_predict',
                                     repeat=1,
                                     resultdir='./results_link_all',
                                     testDataType='sbm_cd',
                                     clname='lr',
                                     node_num=node_num )

            >>> embedding.learn_embedding()
            >>> embedding.get_embedding()
            >>> outdir = args.resultdir
            >>> if not os.path.exists(outdir):
            >>>     os.mkdir(outdir)
            >>> outdir = outdir + '/' + args.testDataType
            >>> if not os.path.exists(outdir):
            >>>     os.mkdir(outdir)
            >>> outdir = outdir + '/' + 'dynTRIAD'
            >>> if not os.path.exists(outdir):
             >>>    os.mkdir(outdir)

            >>> lp.expstaticLP_TRIAD(dynamic_sbm_series,
                                 graphs,
                                 embedding,
                                 1,
                                 outdir + '/',
                                 'nm' + str(args.nodemigration) + '_l' + str(args.nsteps) + '_emb' + str(args.embdim),
                                 )
    """

    def __init__(self, *hyper_dict, **kwargs):
        hyper_params = {
            'method_name': 'Dynamic TRIAD',
            'modelfile': None,
            'weightfile': None,
            'savefilesuffix': None

        }
        hyper_params.update(kwargs)
        for key in hyper_params.keys():
            self.__setattr__('_%s' % key, hyper_params[key])
        for dictionary in hyper_dict:
            for key in dictionary:
                self.__setattr__('_%s' % key, dictionary[key])
        self.clf = self.__make_classifier()
        self._model = None
        # self._clname='lr'       

    def __make_classifier(self):
        """Function to initialize the classifier"""
        class_weight = 'balanced'

        if self._clname == 'svm':
            return svm.SVC(kernel='linear', class_weight=class_weight)
        elif self._clname == 'lr':
            return LogisticRegression(class_weight=class_weight)
        else:
            raise NotImplementedError()

[docs]    def load_trainmod(self, modname):
        """Function to load the training module"""
        mod = importlib.import_module(modname)
        return getattr(mod, 'Model')

[docs]    def load_datamod(self, modname):
        """Function to load the dataset module"""
        mod = importlib.import_module(modname)
        return getattr(mod, 'Dataset')

[docs]    def load_or_update_cache(self, ds, cachefn):
        """Function to either update or load the cache"""
        if cachefn is None:
            return
        cachefn += '.cache'
        if isfile(cachefn + '.args'):
            args = cPickle.load(open(cachefn + '.args', 'r'))
            try:
                ds.load_cache(args, lambda: cPickle.load(open(cachefn, 'r')))
                print("Data loaded from cache file {}".format(cachefn))
                return
            except (ValueError, EOFError) as e:
                print("Failed to load cache file {}: {}".format(cachefn, e.message))

        # update cache
        print("updating cache file for prefix {}".format(cachefn))
        ar, args = ds.cache()
        cPickle.dump(args, open(cachefn + '.args', 'w'))
        cPickle.dump(ar, open(cachefn, 'w'))
        print("cache file {} updated".format(cachefn))

[docs]    def export(self, vertices, data, outdir):
        """function to export the data"""
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        outdir = outdir + '/' + self._datatype
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        outdir = outdir + '/dynTriad'
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        for i in range(len(data)):
            assert len(vertices) == len(data[i]), (len(vertices), len(data[i]))
            fn = "{}/{}.out".format(outdir, i)
            fh = open(fn, 'w')
            for j in range(len(vertices)):
                print("{} {}".format(vertices[j], ' '.join(["{:.3f}".format(d) for d in data[i][j]])), file=fh)
            fh.close()

[docs]    def load_embedding(self, fn, vs):
        """Function to load the embedding"""
        data = open(fn, 'r').read().rstrip('\n').split('\n')
        emb = {}
        for line in data:
            fields = line.split()
            emb[fields[0]] = [float(e) for e in fields[1:]]
        # it is possible that the output order differs from :param vs: given different node_type,
        # so we have to reorder the embedding according to :param vs:
        emb = [emb[str(v)] for v in vs]

        return np.vstack(emb)

[docs]    def get_method_name(self):
        """Function to return the method name.
            
           Returns:
                String: Name of the method.
        """
        return self._method_name

[docs]    def get_method_summary(self):
        """Function to return the summary of the algorithm. 
           
           Returns:
                String: Method summary
        """
        return '%s_%d' % (self._method_name)

[docs]    def learn_embedding(self):
        """Learns the embedding of the nodes.
           
            Returns:
                List: Node embeddings and time taken by the algorithm
        """
        # TensorFlow wizardry
        config = tf.ConfigProto()

        # Don't pre-allocate memory; allocate as-needed
        config.gpu_options.allow_growth = True

        # Only allow a total of half the GPU memory to be allocated
        config.gpu_options.per_process_gpu_memory_fraction = 0.2

        # Create a session to pass the above configuration
        sess = tf.Session(config=config)

        # Create a tensorflow debugger wrapper
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess) 

        # Create a session with the above options specified.
        KBack.tensorflow_backend.set_session(sess)

        TrainModel = self.load_trainmod(self._trainmod)
        Dataset = self.load_datamod(self._datasetmod)

        ds = Dataset(self._datafile, self._starttime, self._nsteps, stepsize=self._stepsize,
                     stepstride=self._stepstride)
        #         self.load_or_update_cache(ds, self._cachefn)
        # dsargs = {'datafile': self._datafile, 'starttime': self._starttime, 'nsteps': self._nsteps,
        #           'stepsize': self._stepsize, 'stepstride': self._stepstride, 'datasetmod': self._datasetmod}
        tm = TrainModel(ds, pretrain_size=self._pretrain_size, embdim=self._embdim, beta=self._beta,
                        lr=self._lr, batchsize=self._batchsize, sampling_args=self._sampling_args)

        edgecnt = [g.number_of_edges() for g in ds.gtgraphs]
        k_edgecnt = sum(edgecnt[:self._pretrain_size])
        print("{} edges in pretraining graphs".format(k_edgecnt))

        if self._pretrain_size > 0:
            initstep = int(ds.time2step(self._starttime))
            tm.pretrain_begin(initstep, initstep + self._pretrain_size)

            print("generating validation set")
            validargs = tm.dataset.sample_test_data(self._validation, initstep, initstep + self._pretrain_size,
                                                    size=10000)
            # print(validargs)
            print("{} validation samples generated".format(len(validargs[0])))

            max_val, max_idx, maxmodel = -1, 0, None

            # for early stopping
            start_time = time.time()
            scores = []
            for i in range(self._niters):
                tm.pretrain_begin_iteration()

                epoch_loss = 0
                for batidx, bat in enumerate(tm.batches(self._batchsize)):
                    inputs = tm.make_pretrain_input(bat)
                    l = tm.pretrain['lossfunc'](inputs)
                    if isinstance(l, (list, tuple)):
                        l = l[0]
                    epoch_loss += l
                    print("\repoch {}: {:.0%} completed, cur loss: {:.3f}".format(i, float(batidx * self._batchsize)
                                                                                  / tm.sample_size(), l.flat[0]),
                          end='')
                    sys.stdout.flush()
                tm.pretrain_end_iteration()

                print(" training completed, total loss {}".format(epoch_loss), end='')

                # without validation, the model exists only after I iterations
                if self._validation != 'none':
                    val_score = tm.validate(self._validation, *validargs)

                    if val_score > max_val:
                        max_val = val_score
                        max_idx = i
                        maxmodel = tm.save_model()
                    print(", validation score {:.3f}".format(val_score))
                else:
                    max_idx, max_val = i, epoch_loss
                    # maxmodel is not saved here in order to save time
                    print("")

                if self._validation != 'none':
                    scores.append(val_score)
                    if max_val > 0 and i - max_idx > 5:
                        break

            print("best validation score at itr {}: {}".format(max_idx, max_val))
            print("{} seconds elapsed for pretraining".format(time.time() - start_time))
            # lastmodel = tm.save_model()  # for debug
            print("saving output to {}".format(self._outdir))
            tm.restore_model(maxmodel)
            tm.pretrain_end()
            self.export(list(tm.dataset.mygraphs['any'].nodes()), tm.export(), self._outdir)

        # online training disabled
        startstep = int(tm.dataset.time2step(self._starttime))
        for y in range(startstep + self._pretrain_size, startstep + self._nsteps):
            raise NotImplementedError()

[docs]    def get_embedding(self):
        """Function to return the embeddings"""
        self._X = dataprep_util.getemb_dynTriad(self._outdir + '/' + self._testDataType + '/dynTriad', self._nsteps,
                                                self._embdim)
        return self._X

[docs]    def get_edge_weight(self, t, i, j):
        """Function to get edge weight.
           
            Attributes:
              i (int): source node for the edge.
              j (int): target node for the edge.
              embed (Matrix): Embedding values of all the nodes.
              filesuffix (str): File suffix to be used to load the embedding.

            Returns:
                Float: Weight of the given edge.
        """
        try:
            feat = np.fabs(self._X[t][i, :] - self._X[t][j, :])
            return self._model.predict(np.reshape(feat, [1, -1]))[0]
        except:
            pdb.set_trace()

[docs]    def get_reconstructed_adj(self, t, X=None, node_l=None):
        """Function to reconstruct the adjacency list for the given node.
           
            Attributes:
              node_l (int): node for which the adjacency list will be created.
              X (Matrix): Embedding values of all the nodes.
              t (int): Time step

            Returns:
                List : Adjacency list of the given node.
        """
        if X is not None:
            node_num = X.shape[0]
            # self._X = X
        else:
            node_num = self._node_num
        adj_mtx_r = np.zeros((node_num, node_num))
        for v_i in range(node_num):
            for v_j in range(node_num):
                if v_i == v_j:
                    continue
                adj_mtx_r[v_i, v_j] = self.get_edge_weight(t, v_i, v_j)
        return adj_mtx_r

[docs]    def sample_link_reconstruction(self, g, sample_nodes=None, negdup=1):
        """Function to sample the link reconstruction"""
        pos = []
        # assert not g.is_directed()
        # for g in graphs:
        for e in g.edges():
            if int(e[0]) > int(e[1]):
                # check symmetric
                names = list(g.nodes())
                assert g.edges(e[0], e[1]), "{}: {} {}".format(names[e[0]],
                                                               names[e[1]])
                continue
            pos.append([int(e[0]), int(e[1])])
        pos = np.vstack(pos).astype('int32')

        neg = []
        vsize = len(g.nodes())
        nodenames = list(g.nodes())
        for i in range(negdup):
            for p in pos:
                src, tgt = p
                # g = self.mygraphs[tm + intv]
                assert g.out_degree(nodenames[src]) < vsize - 1 or g.out_degree(nodenames[tgt]) < vsize - 1, \
                    "We do not expect any node to connect to all other nodes"

                while True:
                    if random.randint(0, 1) == 0:  # replace source
                        # cur_range = negrange[tm][tgt]
                        # new_src = cur_range[random.randint(0, len(cur_range) - 1)]
                        new_src = random.randint(0, vsize - 1)
                        if not g.has_edge(nodenames[new_src], nodenames[tgt]):
                            neg.append([new_src, tgt])
                            break
                    else:  # replace target
                        # cur_range = negrange[tm][src]
                        # new_tgt = cur_range[random.randint(0, len(cur_range) - 1)]
                        new_tgt = random.randint(0, vsize - 1)
                        if not g.has_edge(nodenames[src], nodenames[new_tgt]):
                            neg.append([src, new_tgt])
                            break
        neg = np.vstack(neg).astype('int32')

        lbs = np.concatenate((np.ones(len(pos)), -np.ones(len(neg))))
        return np.concatenate((pos, neg), axis=0), lbs

[docs]    class ResultPresenter(object):
        """result presenter class"""
        def __init__(self):
            self.f1, self.prec, self.rec, self.acc = [], [], [], []

        def add_result(self, res):
            self.prec.extend(res[0])
            self.rec.extend(res[1])
            self.f1.extend(res[2])
            self.acc.extend(res[3])

        def show_result(self):
            print("precision mean: {} std: {}".format(np.mean(self.prec), np.std(self.prec)))
            print("recall mean: {} std: {}".format(np.mean(self.rec), np.std(self.rec)))
            print("f1 mean: {} std: {}".format(np.mean(self.f1), np.std(self.f1)))
            print("accuracy mean: {} std: {}".format(np.mean(self.acc), np.std(self.acc)))

    def __classify(self, feat, lbs):
        sm = None

        poscnt, negcnt = np.sum(lbs == 1), np.sum(lbs == -1)
        print("classifying with pos:neg = {}:{}".format(poscnt, negcnt))

        try:
            cv = StratifiedKFold(n_splits=5, shuffle=True)
            parts = cv.split(feat, lbs)
        except TypeError:
            cv = StratifiedKFold(lbs, n_folds=5, shuffle=True)
            parts = cv

        f1, prec, rec, acc = [], [], [], []
        for tr, te in parts:
            if sm is not None:
                x, y = sm.fit_sample(feat[tr], lbs[tr])
                # x, y = feat[tr], lbs[tr]
            else:
                x, y = feat[tr], lbs[tr]
            model = self.clf.fit(x, y)
            p = model.predict(feat[te])
            # self._model=model
            # if self.debug:
            #     print("results:", p, lbs[te])
            # print(p,np.shape(p))
            f1.append(f1_score(lbs[te], p))
            prec.append(precision_score(lbs[te], p))
            rec.append(recall_score(lbs[te], p))
            acc.append(accuracy_score(lbs[te], p))
        # idx = np.random.permutation(len(lbs))
        # x,y = feat[idx], lbs[idx]
        # self._model=self.clf.fit(x, y)    
        return prec, rec, f1, acc

[docs]    def link_predict(self, g, t, intv=0, repeat=1):
        """Function to perform link prediction"""
        samp, lbs = self.sample_link_reconstruction(g, sample_nodes=None, negdup=1)
        # pdb.set_trace()
        # TODO: different feature generation method might be used here
        try:
            feat = np.fabs(self._X[t][samp[:, 0]] - self._X[t][samp[:, 1]])
        except:
            pdb.set_trace()
        print("feature shape {}".format(feat.shape))

        # rp = self.ResultPresenter()
        # for i in range(repeat):
        #     res = self.__classify(feat, lbs)
        #     rp.add_result(res)
        # rp.show_result()

        idx = np.random.permutation(len(lbs))
        x, y = feat[idx], lbs[idx]
        self._model = self.clf.fit(x, y)

[docs]    def predict_next_adj(self, t, node_l=None):
        """Function to predict the next adjacency for the given node.
           
            Attributes:
              node_l (int): node for which the adjacency list will be created.

            Returns:
                List: Reconstructed adjancey list.
        """
        if node_l is not None:
            return self.get_reconstructed_adj(t, node_l)
        else:
            return self.get_reconstructed_adj(t)

[docs]    def plotresults(self, dynamic_sbm_series):
        """Function to plot the result"""
        plt.figure()
        plt.clf()
        viz.plot_static_sbm_embedding(self._X[-4:], dynamic_sbm_series[-4:])