import argparse
from os.path import join as opj, basename
from multiprocessing import Pool

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split, KFold

import matplotlib
matplotlib.use("pgf")
from matplotlib import pyplot as plt

matplotlib.rcParams.update(
    {
        "pgf.texsystem": "xelatex",
        "font.family": "serif",
        "font.serif": "Times New Roman",
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)
# Assign the preamble (the previous str.join discarded its result), and load
# amssymb as a package rather than via \setmainfont.
plt.rcParams["text.latex.preamble"] = "\n".join([
    r"\usepackage{amsmath}",
    r"\usepackage{amssymb}",
])


class TrajectoryModel(torch.nn.Module):
    def __init__(self, N, R, C, nu):
        super().__init__()
        self.N = N  # number of authors
        self.R = R  # number of source ("start") topics
        self.C = C  # number of target ("end") topics
        self.nu = nu  # topic-overlap matrix (prior similarity between topics)
        self.init_weights()
        torch.autograd.set_detect_anomaly(True)  # expensive; kept for debugging
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
            self.to(self.device)
            # NOTE: callers must also move their input tensors to self.device.
        else:
            self.device = torch.device("cpu")
            print("GPU is not available, using CPU instead")

    def init_weights(self):
        self.beta = torch.nn.Parameter(
            torch.zeros((self.N, self.R, self.C))
        )  # unused by the current loss
        self.mu = torch.nn.Parameter(torch.zeros((self.R, self.C - 1)))
        self.gamma = torch.nn.Parameter(torch.zeros((self.R, self.C)))
        self.delta = torch.nn.Parameter(torch.zeros((self.R, self.C)))
        self.eps = torch.nn.Parameter(
            torch.zeros((self.N, self.R, self.C - 1))
        )
        # was torch.zeros((R, C-1)): R and C are undefined inside the class
        self.sigma = torch.nn.Parameter(torch.zeros((self.R, self.C - 1)))
        self.mu_nu = torch.nn.Parameter(torch.zeros(1))

    # Named `fit` rather than `train` to avoid shadowing nn.Module.train().
    def fit(self, train, validation, epoch=50, autostop=False, printouts=1000):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01)
        epochs = []
        train_loss = []
        validation_loss = []
        reference_loss = []
        for t in range(epoch):
            loss = self.loss(train)
            print(t, loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (t % 2) == 0:
                y_pred_train = self.predict(train).detach().numpy()
                y_pred_validation = self.predict(validation).detach().numpy()
                epochs.append(t)
                # Total variation distance between predicted and observed mixes.
                train_loss.append(float((np.abs(train["y"] - y_pred_train).sum(axis=1) / 2).mean()))
                validation_loss.append(float((np.abs(validation["y"] - y_pred_validation).sum(axis=1) / 2).mean()))
                # Baseline: distance between the end mix and the start mix.
                reference_loss.append(float((np.abs(validation["y"] - validation["x"]).sum(axis=1) / 2).mean()))
                if (t % printouts) == 0:
                    fig, ax = plt.subplots(figsize=[6.4 * 0.75, 4.8 * 0.75])
                    ax.plot(np.array(epochs), train_loss, label=r"$d_{\mathrm{TV}}(\vec{y}_a,\vec{y}_a^{\mathrm{pred}})$ -- training set")
                    ax.plot(np.array(epochs), validation_loss, label=r"$d_{\mathrm{TV}}(\vec{y}_a,\vec{y}_a^{\mathrm{pred}})$ -- test set")
                    ax.plot(np.array(epochs), reference_loss, label=r"$d_{\mathrm{TV}}(\vec{y}_a,\vec{x}_a)$ -- test set")
                    ax.set_xlabel("Epochs")
                    ax.set_ylabel("Performance (total variation distance)")
                    ax.legend(loc="upper right", bbox_to_anchor=(1, 1.2))
                    ax.set_ylim(0.3, 0.6)
                    fig.savefig(f"status_{basename(args.input)}.eps", bbox_inches="tight")
                    plt.close(fig)  # avoid accumulating open figures
                # Stop once validation loss has increased twice in a row.
                if autostop and len(validation_loss) > 2 and validation_loss[-1] > validation_loss[-2] and validation_loss[-2] > validation_loss[-3]:
                    break
        return train_loss, validation_loss, reference_loss
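    # Reading directly off the code below: for author a, source topic i and
    # target topic j, the prediction is a mixture of per-source-topic
    # transition distributions,
    #
    #   s[a,i,j] = mu[i,j] + mu_nu * nu[i,j]
    #              + gamma[i,j] * expertise[a,j] + delta[i,j] * cov[a,j]
    #              + eps[a,i,j]   (random effects, used only in the loss),
    #   b[a,i,:] = softmax(s[a,i,:]),
    #   p[a,j]   = sum_i x[a,i] * b[a,i,j],
    #
    # i.e. the initial topic mix x is pushed through an author-specific
    # row-stochastic transition matrix b. The last column of the free
    # parameters mu and eps is fixed at zero for identifiability.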
    def predict(self, data, eps=None):
        N = data["N"]
        mu = torch.zeros((self.R, self.C))
        mu[:, :-1] = self.mu
        mu = mu + self.nu * self.mu_nu  # out-of-place: nu may be float64
        s = torch.zeros((N, self.R, self.C))
        s = s + torch.einsum("ij,aj->aij", self.gamma, data["expertise"])
        s = s + torch.einsum("ij,aj->aij", self.delta, data["cov"])
        s = s + mu
        if eps is not None:
            eps_ = torch.zeros((N, self.R, self.C))
            eps_[:, :, :-1] = eps
            s = s + eps_
        b = torch.softmax(s, dim=2)
        p = torch.einsum("aij,ai->aj", b, data["x"])
        return p

    def loss(self, data):
        Y = data["Y"]
        loss = 0
        p = self.predict(data, self.eps)
        # Multinomial likelihood of each author's end-of-career counts; the
        # loop is needed because total_count differs across authors.
        for a in range(p.shape[0]):
            multi = torch.distributions.multinomial.Multinomial(
                total_count=int(Y[a, :].sum().item()),
                probs=p[a, :],
            )
            loss -= multi.log_prob(Y[a, :]).sum()
        print("evidence loss: ", loss / data["N"])
        # Priors: hierarchical normal on the random effects, exponential on
        # their scale, standard normal on the fixed effects.
        eps_prior = torch.distributions.normal.Normal(0, self.sigma.exp())
        sigma_prior = torch.distributions.exponential.Exponential(1)
        normal_prior = torch.distributions.normal.Normal(0, 1)
        priors_loss = 0
        priors_loss -= eps_prior.log_prob(self.eps).sum()
        priors_loss -= sigma_prior.log_prob(self.sigma.exp()).sum()
        priors_loss -= normal_prior.log_prob(self.mu).sum()
        priors_loss -= normal_prior.log_prob(self.delta).sum()
        priors_loss -= normal_prior.log_prob(self.gamma).sum()
        priors_loss -= normal_prior.log_prob(self.mu_nu).sum()
        print("priors loss:", priors_loss / data["N"])
        loss += priors_loss
        loss /= data["N"]
        return loss


parser = argparse.ArgumentParser()
parser.add_argument("--input")
parser.add_argument("--folds", default=0, type=int)
args = parser.parse_args()

n_topics = len(pd.read_csv(opj(args.input, "topics.csv")))

# Keep authors with at least 100 initial topic counts.
df = pd.read_csv(opj(args.input, "aggregate.csv"))
df = df[df[[f"start_{k+1}" for k in range(n_topics)]].sum(axis=1) >= 100]

resources = pd.read_parquet(opj(args.input, "pooled_resources.parquet"))
df = df.merge(resources, on="bai")

data = {
    "NR": df[[f"start_{k+1}" for k in range(n_topics)]].values.astype(int),
    "NC": df[[f"end_{k+1}" for k in range(n_topics)]].values.astype(int),
    "expertise": df[[f"expertise_{k+1}" for k in range(n_topics)]].fillna(0).values,
}
data["cov"] = np.stack(df["pooled_resources"])

# Drop topics with no start or end counts at all.
junk = np.sum(data["NR"] + data["NC"], axis=0) == 0
for col in ["NR", "NC", "cov", "expertise"]:
    data[col] = data[col][:, ~junk]

R = C = n_topics - junk.sum()

data["cov"] = np.nan_to_num(data["cov"])
data["expertise"] = np.nan_to_num(data["expertise"])

# nu[i, j]: probability of above-average expertise in topic j given
# above-average expertise in topic i.
expertise = data["expertise"]
nu = np.array([
    [
        ((expertise[:, i] > expertise[:, i].mean()) & (expertise[:, j] > expertise[:, j].mean())).mean()
        / (expertise[:, i] > expertise[:, i].mean()).mean()
        for j in range(R)
    ]
    for i in range(R)
])

data["Y"] = data["NC"]
data["x"] = data["NR"] / data["NR"].sum(axis=1)[:, np.newaxis]
data["y"] = data["NC"] / data["NC"].sum(axis=1)[:, np.newaxis]

N = data["x"].shape[0]


def split_train_validation(data, test_size):
    train_ind, test_ind = train_test_split(np.arange(N), test_size=test_size)
    train, validation = {}, {}
    for k in data:
        # Cast to a common dtype so the tensors match the float32 parameters.
        train[k] = torch.from_numpy(data[k][train_ind]).float()
        validation[k] = torch.from_numpy(data[k][test_ind]).float()
    return train, validation


def folds(data, n_splits):
    f = []
    kf = KFold(n_splits=n_splits, shuffle=True)
    for train_ind, test_ind in kf.split(np.arange(N)):
        fold_train, fold_test = {}, {}
        for k in data:
            fold_train[k] = torch.from_numpy(data[k][train_ind]).float()
            fold_test[k] = torch.from_numpy(data[k][test_ind]).float()
        f.append((fold_train, fold_test))
    return f


def run_model(data):
    data[0]["N"] = data[0]["x"].shape[0]
    data[1]["N"] = data[1]["x"].shape[0]
    mdl = TrajectoryModel(data[0]["N"], R, C, torch.from_numpy(nu).float())
    train_loss, validation_loss, reference_loss = mdl.fit(
        data[0], data[1], epoch=1000, autostop=True, printouts=50
    )
    # The loss lists already hold plain floats.
    scores = [train_loss[-1], validation_loss[-1], reference_loss[-1]]
    print(scores)
    return scores
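# NOTE (assumption about the runtime): pool.map below relies on worker
# processes inheriting the module-level globals (args, data, R, C, nu), which
# holds under the default "fork" start method on Linux. Under "spawn"
# (macOS/Windows) each worker re-imports this module and re-loads the data.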
if args.folds > 0:
    # One process per fold; each fits an independent model.
    f = folds(data, args.folds)
    with Pool(processes=args.folds) as pool:
        scores = pool.map(run_model, f)
    scores = np.array(scores)
    np.save(opj(args.input, "scores.npy"), scores)
else:
    train, validation = split_train_validation(data, test_size=0.2)
    train["N"] = train["x"].shape[0]
    validation["N"] = validation["x"].shape[0]
    mdl = TrajectoryModel(train["N"], R, C, torch.from_numpy(nu).float())
    mdl.fit(train, validation, epoch=800)
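# Example invocations (script name and paths are illustrative; --input must
# point to a directory containing topics.csv, aggregate.csv and
# pooled_resources.parquet):
#
#   python trajectory_model.py --input data/physics --folds 5   # k-fold CV scores
#   python trajectory_model.py --input data/physics              # single 80/20 split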