import argparse
import pickle
from functools import partial
from multiprocessing import Pool
from os.path import join as opj, basename

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split, KFold
from scipy.special import softmax
import seaborn as sns

import matplotlib
matplotlib.use("pgf")  # select the backend before importing pyplot
from matplotlib import pyplot as plt

matplotlib.rcParams.update(
    {
        "pgf.texsystem": "xelatex",
        "font.family": "serif",
        "font.serif": "Times New Roman",
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)
# The preamble must be assigned; calling str.join alone returns a new string
# and discards it. Also, amssymb is a package, not a main font.
plt.rcParams["text.latex.preamble"] = "\n".join([
    r"\usepackage{amsmath}",
    r"\usepackage{amssymb}",
])
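
# TrajectoryModel learns, for each author a, a row-stochastic transition
# matrix B_a that maps the author's start-of-career topic distribution x_a
# to their end-of-career distribution y_a. The transition scores combine a
# shared baseline (mu, plus the co-expertise matrix nu scaled by mu_nu),
# expertise and covariate effects (gamma, delta), and per-author deviations
# (eps) with a learned scale (sigma). Fitting maximizes the multinomial
# likelihood of the end-of-career counts under the priors defined in
# loss(), i.e. MAP estimation.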
class TrajectoryModel(torch.nn.Module):
    def __init__(self, N, R, C, nu):
        super().__init__()
        self.N = N  # number of authors
        self.R = R  # number of start ("row") topics
        self.C = C  # number of end ("column") topics
        self.dtype = torch.float
        # Register nu as a buffer so it moves with the module in .to().
        self.register_buffer("nu", nu.float())
        self.init_weights()
        torch.autograd.set_detect_anomaly(True)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if self.device.type == "cpu":
            print("GPU is not available, using CPU instead")
        self.to(self.device)

    def init_weights(self):
        # Per-author transition scores (kept from the original; not used in
        # predict() or loss()).
        self.beta = torch.nn.Parameter(torch.zeros((self.N, self.R, self.C)))
        # Baseline transition scores; the last column is pinned to zero for
        # identifiability, hence C-1 free columns.
        self.mu = torch.nn.Parameter(torch.zeros((self.R, self.C - 1)))
        # Effects of expertise and of the covariates on the transition scores.
        self.gamma = torch.nn.Parameter(torch.zeros((self.R, self.C)))
        self.delta = torch.nn.Parameter(torch.zeros((self.R, self.C)))
        # Per-author deviations and their log-scale.
        self.eps = torch.nn.Parameter(torch.zeros((self.N, self.R, self.C - 1)))
        self.sigma = torch.nn.Parameter(torch.zeros((self.R, self.C - 1)))
        # Coefficient of the co-expertise matrix nu.
        self.mu_nu = torch.nn.Parameter(torch.zeros(1))
    def fit(self, train, validation, epoch=50, autostop=False, printouts=1000):
        # Named fit() rather than train() to avoid shadowing
        # torch.nn.Module.train().
        optimizer = torch.optim.Adam(self.parameters(), lr=0.01)
        # Move the data to the parameters' device once, up front.
        train = {k: v.to(self.device) if torch.is_tensor(v) else v for k, v in train.items()}
        validation = {k: v.to(self.device) if torch.is_tensor(v) else v for k, v in validation.items()}
        epochs = []
        train_loss = []
        validation_loss = []
        reference_loss = []
        for t in range(epoch):
            loss = self.loss(train)
            print(t, loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (t % 2) == 0:
                y_pred_train = self.predict(train).detach().cpu().numpy()
                y_pred_validation = self.predict(validation).detach().cpu().numpy()
                y_train = train["y"].cpu().numpy()
                y_validation = validation["y"].cpu().numpy()
                x_validation = validation["x"].cpu().numpy()
                epochs.append(t)
                # Total variation distance between predicted and observed
                # end-of-career topic distributions, averaged over authors.
                train_loss.append((np.abs(y_train - y_pred_train).sum(axis=1) / 2).mean())
                validation_loss.append((np.abs(y_validation - y_pred_validation).sum(axis=1) / 2).mean())
                # Reference: distance between end and start distributions.
                reference_loss.append((np.abs(y_validation - x_validation).sum(axis=1) / 2).mean())
            if (t % printouts) == 0:
                fig, ax = plt.subplots(figsize=[6.4 * 0.75, 4.8 * 0.75])
                ax.plot(np.array(epochs), train_loss, label="$d_{\\mathrm{TV}}(\\vec{y}_a,\\vec{y}_a^{\\mathrm{pred}})$ -- training set")
                ax.plot(np.array(epochs), validation_loss, label="$d_{\\mathrm{TV}}(\\vec{y}_a,\\vec{y}_a^{\\mathrm{pred}})$ -- test set")
                ax.plot(np.array(epochs), reference_loss, label="$d_{\\mathrm{TV}}(\\vec{y}_a,\\vec{x}_a)$ -- test set")
                ax.set_xlabel("Epochs")
                ax.set_ylabel("Performance (total variation distance)")
                ax.legend(loc="upper right", bbox_to_anchor=(1, 1.2))
                ax.set_ylim(0.3, 0.6)
                # The pgf backend cannot write EPS; save a PDF instead.
                # Relies on the module-level `args` parsed below.
                fig.savefig(f"status_{basename(args.input)}.pdf", bbox_inches="tight")
                plt.close(fig)
            # Stop once the validation loss has increased twice in a row.
            if autostop and len(validation_loss) > 2 and validation_loss[-1] > validation_loss[-2] and validation_loss[-2] > validation_loss[-3]:
                break
        return train_loss, validation_loss, reference_loss
    def predict(self, data, eps=None):
        N = data["N"]
        # Pad mu's pinned last column back in and add the co-expertise term.
        mu = torch.zeros((self.R, self.C), device=self.mu.device)
        mu[:, :-1] = self.mu
        mu = mu + self.nu * self.mu_nu
        # Transition scores: baseline + expertise and covariate effects.
        s = torch.zeros((N, self.R, self.C), device=mu.device)
        s = s + torch.einsum("ij,aj->aij", self.gamma, data["expertise"])
        s = s + torch.einsum("ij,aj->aij", self.delta, data["cov"])
        s = s + mu
        if eps is not None:
            eps_ = torch.zeros((N, self.R, self.C), device=mu.device)
            eps_[:, :, :-1] = eps
            s = s + eps_
        # Row-stochastic transition matrices, one per author.
        b = torch.softmax(s, dim=2)
        # Predicted end distribution: p_a = B_a^T x_a.
        p = torch.einsum("aij,ai->aj", b, data["x"])
        return p
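
    # Shape check (illustrative): with N=2 authors and R=C=3 topics, s has
    # shape (2, 3, 3); softmax over dim=2 makes each row b[a, i, :] sum to
    # one, so p has shape (2, 3) and lies on the simplex whenever x[a, :]
    # does.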
    def loss(self, data):
        Y = data["Y"]
        loss = 0
        p = self.predict(data, self.eps)
        # Multinomial likelihood of the observed end-of-career counts.
        for a in range(p.shape[0]):
            multi = torch.distributions.multinomial.Multinomial(
                total_count=int(Y[a, :].sum().item()),
                probs=p[a, :],
            )
            loss -= multi.log_prob(Y[a, :]).sum()
        print("evidence loss: ", loss / data["N"])
        # Priors: hierarchical normal on the per-author deviations, with an
        # exponential hyperprior on their scale; standard normals elsewhere.
        eps_prior = torch.distributions.normal.Normal(0, self.sigma.exp())
        sigma_prior = torch.distributions.exponential.Exponential(1)
        normal_prior = torch.distributions.normal.Normal(0, 1)
        priors_loss = 0
        priors_loss -= eps_prior.log_prob(self.eps).sum()
        priors_loss -= sigma_prior.log_prob(self.sigma.exp()).sum()
        priors_loss -= normal_prior.log_prob(self.mu).sum()
        priors_loss -= normal_prior.log_prob(self.delta).sum()
        priors_loss -= normal_prior.log_prob(self.gamma).sum()
        priors_loss -= normal_prior.log_prob(self.mu_nu).sum()
        print("priors loss:", priors_loss / data["N"])
        loss += priors_loss
        loss /= data["N"]
        return loss
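
# Example invocation (hypothetical file names; the script expects topics.csv
# and, unless --resources is given, pooled_resources.parquet inside --input):
#
#   python trajectory_model.py --input data/run1 \
#       --portfolios data/run1/portfolios.csv --output baseline --folds 5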
parser = argparse.ArgumentParser()
parser.add_argument("--input")
parser.add_argument("--portfolios")
parser.add_argument("--resources", default=None)
parser.add_argument("--output")
parser.add_argument("--folds", default=0, type=int)
args = parser.parse_args()
n_topics = len(pd.read_csv(opj(args.input, "topics.csv")))
df = pd.read_csv(args.portfolios)
# Keep authors whose start-period topic counts sum to at least 100.
df = df[df[[f"start_{k+1}" for k in range(n_topics)]].sum(axis=1) >= 100]
resources = pd.read_parquet(
    opj(args.input, "pooled_resources.parquet") if args.resources is None else args.resources
)
df = df.merge(resources, left_on="bai", right_on="bai")

data = {
    "NR": df[[f"start_{k+1}" for k in range(n_topics)]].values.astype(int),
    "NC": df[[f"end_{k+1}" for k in range(n_topics)]].values.astype(int),
    "expertise": df[[f"expertise_{k+1}" for k in range(n_topics)]].fillna(0).values,
}
data["cov"] = np.stack(df["pooled_resources"])

# Drop topics with no observations at either end of the career.
junk = np.sum(data["NR"] + data["NC"], axis=0) == 0
for col in ["NR", "NC", "cov", "expertise"]:
    data[col] = data[col][:, ~junk]
R = n_topics - junk.sum()
C = n_topics - junk.sum()

data["cov"] = np.nan_to_num(data["cov"])  # / np.maximum(data["cov"].sum(axis=1)[:, np.newaxis], 1)
data["expertise"] = np.nan_to_num(data["expertise"])  # / np.maximum(data["cov"].sum(axis=1)[:, np.newaxis], 1)

# Co-expertise matrix: nu[i, j] is the fraction of authors above-average in
# topic j among those above-average in topic i.
expertise = data["expertise"]
above = expertise > expertise.mean(axis=0)
nu = np.array([
    [(above[:, i] & above[:, j]).mean() / above[:, i].mean() for j in range(R)]
    for i in range(R)
])

data["Y"] = data["NC"]
# Start and end topic distributions, normalized to the simplex.
data["x"] = data["NR"] / data["NR"].sum(axis=1)[:, np.newaxis]
data["y"] = data["NC"] / data["NC"].sum(axis=1)[:, np.newaxis]
N = data["x"].shape[0]
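
# At this point data["x"] and data["y"] are (N, R) start/end topic
# distributions, data["Y"] holds the raw end-of-career counts, and
# data["expertise"] and data["cov"] are the per-author predictors.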
def split_train_validation(data, test_size):
    train_ind, test_ind = train_test_split(np.arange(N), test_size=test_size)
    train, validation = {}, {}
    for k in data:
        # float32 tensors, matching the model parameters' dtype
        train[k] = torch.from_numpy(data[k][train_ind]).float()
        validation[k] = torch.from_numpy(data[k][test_ind]).float()
    return train, validation

def folds(data, n_splits):
    f = []
    kf = KFold(n_splits=n_splits, shuffle=True)
    for train_ind, test_ind in kf.split(np.arange(N)):
        fold_train, fold_test = {}, {}
        for k in data:
            fold_train[k] = torch.from_numpy(data[k][train_ind]).float()
            fold_test[k] = torch.from_numpy(data[k][test_ind]).float()
        f.append((fold_train, fold_test))
    return f
def run_model(data):
    train, validation = data
    train["N"] = train["x"].shape[0]
    validation["N"] = validation["x"].shape[0]
    mdl = TrajectoryModel(train["N"], R, C, torch.from_numpy(nu).float())
    train_loss, validation_loss, reference_loss = mdl.fit(
        train, validation,
        epoch=1500,
        autostop=True,
        printouts=50,
    )
    # fit() already returns plain floats, so no detach/conversion is needed.
    scores = [train_loss[-1], validation_loss[-1], reference_loss[-1]]
    print(scores)
    return scores
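
# Note: forking worker processes after CUDA has been initialized can fail;
# if the folds are run on a GPU machine, switching to
# multiprocessing.set_start_method("spawn") may be needed.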
if args.folds > 0:
    f = folds(data, args.folds)
    with Pool(processes=args.folds) as pool:
        scores = pool.map(run_model, f)
    scores = np.array(scores)
    np.save(opj(args.input, f"scores_{args.output}.npy"), scores)
else:
    train, validation = split_train_validation(data, test_size=0.2)
    train["N"] = train["x"].shape[0]
    validation["N"] = validation["x"].shape[0]
    mdl = TrajectoryModel(train["N"], R, C, torch.from_numpy(nu).float())
    mdl.fit(train, validation, epoch=800)