#!/usr/bin/env python
# coding: utf-8

"""Aggregate per-author topic counts into early-career ("start") and late-career ("end")
profiles, plus an author-count-weighted "expertise" profile, and save the results."""

import pandas as pd
import numpy as np
import networkx as nx
from ipysigma import Sigma
from matplotlib import pyplot as plt
import seaborn as sns
import pickle
from os.path import join as opj
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--location")
parser.add_argument("--dataset", default="inspire-harvest/database")
parser.add_argument("--keywords-threshold", type=int, default=200)
parser.add_argument("--articles-threshold", type=int, default=5)
parser.add_argument("--late-periods", nargs="+", type=int, default=[3])  # [2,3] for ACL, [3] for HEP
args = parser.parse_args()

# Topic labels and the article x topic count matrix produced by the topic model.
topics = pd.read_csv(opj(args.location, "topics.csv"))["label"].tolist()
topic_matrix = np.load(opj(args.location, "topics_counts.npy"))

# Article metadata; keep only records with a usable publication year, mapped to a 0-based index.
articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[["article_id", "date_created", "title"]]
articles = articles[articles["date_created"].str.len() >= 4]
if "year" not in articles.columns:
    articles["year"] = articles["date_created"].str[:4].astype(int) - 2000
else:
    articles["year"] = articles["year"].astype(int) - 2002
articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
articles["year_group"] = articles["year"] // 5

# Restrict to the articles used in the topic model, preserving their row order.
_articles = pd.read_csv(opj(args.location, "articles.csv"))
articles["article_id"] = articles.article_id.astype(int)
articles = _articles.merge(articles, how="left")
print(len(_articles))
print(len(articles))

# Assign each article its dominant topic and show a random sample as a sanity check.
articles["main_topic"] = topic_matrix.argmax(axis=1)
articles["main_topic"] = articles["main_topic"].map(lambda k: topics[k])
print(articles[["title", "main_topic"]].sample(frac=1).head(10))

# Authorship records and the number of distinct authors per article.
all_authors = pd.read_parquet(opj(args.dataset, "articles_authors.parquet"))
all_authors["article_id"] = all_authors.article_id.astype(int)
n_authors = all_authors.groupby("article_id").agg(n_authors=("bai", lambda x: x.nunique())).reset_index()

n_articles = len(articles)
articles = articles.merge(n_authors, how="left", left_on="article_id", right_on="article_id")
assert len(articles) == n_articles, "# of articles does not match! cannot continue"
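
# Added sanity check (not part of the original script): the article-level indexing below
# assumes that the rows of topics_counts.npy line up one-to-one with the rows of the
# article table built above, and that its columns match the labels in topics.csv.
# These shape checks only verify that assumption superficially.
assert topic_matrix.shape[0] == len(articles), "topic matrix rows should match the article table"
assert topic_matrix.shape[1] == len(topics), "topic matrix columns should match the topic labels"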
cannot continue" all_authors = all_authors.merge(articles, how="inner", left_on="article_id", right_on="article_id") all_authors["year_range"] = all_authors["year"]//5 n_papers = all_authors.groupby(["bai", "year_range"]).agg(n=("article_id", "count")).reset_index() filtered_authors = [] for author, n in n_papers.groupby("bai"): start = n[n["year_range"]<=1] # end = n[n["year_range"]==3] end = n[n["year_range"].isin(args.late_periods)] if len(start) and len(end): filtered_authors.append({ "author": author, "n_start": start.iloc[0]["n"], "n_end": end.iloc[0]["n"], }) filtered_authors = pd.DataFrame(filtered_authors) filtered_authors = filtered_authors[(filtered_authors["n_start"] >= args.articles_threshold) & (filtered_authors["n_end"] >= args.articles_threshold)] authors=all_authors[all_authors["bai"].isin(filtered_authors["author"])] start_authors = authors[authors["year_range"]<=1] # end_authors = authors[authors["year_range"]==3] end_authors = authors[authors["year_range"].isin(args.late_periods)] authorlist = list(authors["bai"].unique()) inv_articles = {n: i for i,n in enumerate(articles["article_id"].values)} inv_authorlist = {author: i for i, author in enumerate(authorlist)} n_authors = len(authorlist) n_clusters = topic_matrix.shape[1] n_years = articles["year"].max()+1 start = np.zeros((n_authors, n_clusters)) end = np.zeros((n_authors, n_clusters)) expertise = np.zeros((n_authors, n_clusters)) start_count = np.zeros(n_authors) end_count = np.zeros(n_authors) expertise_norm = np.zeros(n_authors) for author, _articles in start_authors.groupby("bai"): for article_id in _articles["article_id"].tolist(): start[inv_authorlist[author],:] += topic_matrix[inv_articles[article_id],:].flat start_count[inv_authorlist[author]] += topic_matrix[inv_articles[article_id],:].sum() n = articles.iloc[inv_articles[article_id]]["n_authors"] expertise[inv_authorlist[author]] += (1/n)*topic_matrix[inv_articles[article_id],:].flat expertise_norm[inv_authorlist[author]] += (1/n)*topic_matrix[inv_articles[article_id],:].sum() for author, _articles in end_authors.groupby("bai"): for article_id in _articles["article_id"].tolist(): end[inv_authorlist[author],:] += topic_matrix[inv_articles[article_id],:].flat end_count[inv_authorlist[author]] += topic_matrix[inv_articles[article_id],:].sum() authors_records = {} for author, _articles in all_authors.groupby("bai"): record = np.zeros((n_years, n_clusters)) record_count = np.zeros((n_years, n_clusters)) for article in _articles.to_dict(orient="records"): year = article["year"] article_id = article["article_id"] record[year,:] += topic_matrix[inv_articles[article_id],:].flat record_count[year] += topic_matrix[inv_articles[article_id],:].sum() authors_records[author] = { "record": record, "record_count": record_count } with open(opj(args.location, "authors_full_records.pickle"), "wb") as handle: pickle.dump(authors_records, handle, protocol=pickle.HIGHEST_PROTOCOL) ok = (start_count>=args.keywords_threshold)&(end_count>=args.keywords_threshold) cluster_names_start = [f"start_{n+1}" for n in range(n_clusters)] cluster_names_end = [f"end_{n+1}" for n in range(n_clusters)] cluster_names_expertise = [f"expertise_{n+1}" for n in range(n_clusters)] start = start[ok] end = end[ok] start_count = start_count[ok] end_count = end_count[ok] expertise = expertise[ok]/expertise_norm[ok][:,np.newaxis] start_norm = (start/start_count[:,np.newaxis]) end_norm = (end/end_count[:,np.newaxis]) print(start_norm.shape) print(end_norm.shape) print(start_norm.mean(axis=0)) 

# Export the per-author aggregates (raw start/end counts and normalized expertise shares).
aggregate = {}
for i in range(n_clusters):
    aggregate[cluster_names_start[i]] = start[:, i]
    aggregate[cluster_names_end[i]] = end[:, i]
    aggregate[cluster_names_expertise[i]] = expertise[:, i]

aggregate = pd.DataFrame(aggregate)
aggregate["bai"] = [bai for i, bai in enumerate(authorlist) if ok[i]]
aggregate.to_csv(opj(args.location, "aggregate.csv"))

# Correlation between early- and late-period topic shares across authors.
sns.heatmap(np.corrcoef(start_norm.T, end_norm.T), vmin=-0.5, vmax=0.5, cmap="RdBu")
plt.show()
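
# Example invocation (illustrative only; the script filename and output directory are
# assumptions, while the flags are the ones defined by the argument parser above):
#   python aggregate_author_topics.py --location <model-output-dir> \
#       --dataset inspire-harvest/database \
#       --keywords-threshold 200 --articles-threshold 5 --late-periods 3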