- #!/usr/bin/env python
- # coding: utf-8
- import pandas as pd
- import numpy as np
- import networkx as nx
- from ipysigma import Sigma
- from matplotlib import pyplot as plt
- import seaborn as sns
- import pickle
- from os.path import join as opj
- import argparse
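- # Command-line options: where to read/write analysis files, which dataset to use,
- # and the thresholds applied below when selecting authors.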
- parser = argparse.ArgumentParser()
- parser.add_argument("--location")
- parser.add_argument("--dataset", default="inspire-harvest/database")
- parser.add_argument("--keywords-threshold", type=int, default=200)
- parser.add_argument("--articles-threshold", type=int, default=5)
- parser.add_argument("--late-periods", nargs="+", type=int, default=[3]) # [2,3] for ACL, [3] for HEP
- args = parser.parse_args()
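- # Topic labels and the article-by-topic count matrix; rows are assumed to be aligned
- # with the rows of articles.csv loaded below.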
- topics = pd.read_csv(opj(args.location, "topics.csv"))["label"].tolist()
- topic_matrix = np.load(opj(args.location, "topics_counts.npy"))
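- # Article metadata: keep records with a usable creation date, derive an integer year
- # offset (from 2000 when computed from date_created, 2002 when the source already has
- # a year column), restrict to offsets 0-40, and bin into 5-year groups.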
- articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[["article_id", "date_created", "title"]]
- articles = articles[articles["date_created"].str.len() >= 4]
- if "years" not in articles.columns:
- articles["year"] = articles["date_created"].str[:4].astype(int)-2000
- else:
- articles["year"] = articles["year"].astype(int)-2002
- articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
- articles["year_group"] = articles["year"]//5
- _articles = pd.read_csv(opj(args.location,"articles.csv"))
- articles["article_id"] = articles.article_id.astype(int)
- articles = _articles.merge(articles, how="left")
- print(len(_articles))
- print(len(articles))
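- # Sanity check: assign each article its dominant topic and inspect a random sample.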
- articles["main_topic"] = topic_matrix.argmax(axis=1)
- articles["main_topic"] = articles["main_topic"].map(lambda k: topics[k])
- print(articles[["title", "main_topic"]].sample(frac=1).head(10))
- print(articles[["title", "main_topic"]].sample(frac=1).head(10))
- all_authors = pd.read_parquet(opj(args.dataset, "articles_authors.parquet"))
- all_authors["article_id"] = all_authors.article_id.astype(int)
- n_authors = all_authors.groupby("article_id").agg(n_authors=("bai", lambda x: x.nunique())).reset_index()
- n_articles = len(articles)
- articles = articles.merge(n_authors, how="left", left_on="article_id", right_on="article_id")
- assert len(articles)==n_articles, "# of articles does not match! cannot continue"
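- # Attach article metadata to each authorship and bin careers into 5-year periods.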
- all_authors = all_authors.merge(articles, how="inner", left_on="article_id", right_on="article_id")
- all_authors["year_range"] = all_authors["year"]//5
- n_papers = all_authors.groupby(["bai", "year_range"]).agg(n=("article_id", "count")).reset_index()
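- # Keep authors with publications both in the early periods (year_range 0-1) and in the
- # requested late periods; note that only the first matching period's count on each side
- # is compared against --articles-threshold.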
- filtered_authors = []
- for author, n in n_papers.groupby("bai"):
-     start = n[n["year_range"]<=1]
-     # end = n[n["year_range"]==3]
-     end = n[n["year_range"].isin(args.late_periods)]
-     if len(start) and len(end):
-         filtered_authors.append({
-             "author": author,
-             "n_start": start.iloc[0]["n"],
-             "n_end": end.iloc[0]["n"],
-         })
- filtered_authors = pd.DataFrame(filtered_authors)
- filtered_authors = filtered_authors[(filtered_authors["n_start"] >= args.articles_threshold) & (filtered_authors["n_end"] >= args.articles_threshold)]
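- # Restrict authorship records to the retained authors and build index lookups;
- # n_authors is reused below as the number of retained authors.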
- authors = all_authors[all_authors["bai"].isin(filtered_authors["author"])]
- start_authors = authors[authors["year_range"]<=1]
- # end_authors = authors[authors["year_range"]==3]
- end_authors = authors[authors["year_range"].isin(args.late_periods)]
- authorlist = list(authors["bai"].unique())
- inv_articles = {n: i for i,n in enumerate(articles["article_id"].values)}
- inv_authorlist = {author: i for i, author in enumerate(authorlist)}
- n_authors = len(authorlist)
- n_clusters = topic_matrix.shape[1]
- n_years = articles["year"].max()+1
- start = np.zeros((n_authors, n_clusters))
- end = np.zeros((n_authors, n_clusters))
- expertise = np.zeros((n_authors, n_clusters))
- start_count = np.zeros(n_authors)
- end_count = np.zeros(n_authors)
- expertise_norm = np.zeros(n_authors)
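- # Accumulate early-period topic counts; expertise divides each article's counts by its
- # author count so that heavily co-authored papers contribute less.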
- for author, _articles in start_authors.groupby("bai"):
-     for article_id in _articles["article_id"].tolist():
-         start[inv_authorlist[author],:] += topic_matrix[inv_articles[article_id],:].flat
-         start_count[inv_authorlist[author]] += topic_matrix[inv_articles[article_id],:].sum()
-         n = articles.iloc[inv_articles[article_id]]["n_authors"]
-         expertise[inv_authorlist[author]] += (1/n)*topic_matrix[inv_articles[article_id],:].flat
-         expertise_norm[inv_authorlist[author]] += (1/n)*topic_matrix[inv_articles[article_id],:].sum()
-
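- # Accumulate late-period topic counts.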
- for author, _articles in end_authors.groupby("bai"):
-     for article_id in _articles["article_id"].tolist():
-         end[inv_authorlist[author],:] += topic_matrix[inv_articles[article_id],:].flat
-         end_count[inv_authorlist[author]] += topic_matrix[inv_articles[article_id],:].sum()
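- # Full per-year topic records for every author, saved for later analyses.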
- authors_records = {}
- for author, _articles in all_authors.groupby("bai"):
-     record = np.zeros((n_years, n_clusters))
-     record_count = np.zeros((n_years, n_clusters))
-     for article in _articles.to_dict(orient="records"):
-         year = article["year"]
-         article_id = article["article_id"]
-
-         record[year,:] += topic_matrix[inv_articles[article_id],:].flat
-         record_count[year] += topic_matrix[inv_articles[article_id],:].sum()
-     authors_records[author] = {
-         "record": record,
-         "record_count": record_count
-     }
- with open(opj(args.location, "authors_full_records.pickle"), "wb") as handle:
-     pickle.dump(authors_records, handle, protocol=pickle.HIGHEST_PROTOCOL)
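- # Keep authors with enough topic-keyword mass in both periods, then normalize the
- # early/late profiles into per-author topic distributions.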
- ok = (start_count >= args.keywords_threshold) & (end_count >= args.keywords_threshold)
- cluster_names_start = [f"start_{n+1}" for n in range(n_clusters)]
- cluster_names_end = [f"end_{n+1}" for n in range(n_clusters)]
- cluster_names_expertise = [f"expertise_{n+1}" for n in range(n_clusters)]
- start = start[ok]
- end = end[ok]
- start_count = start_count[ok]
- end_count = end_count[ok]
- expertise = expertise[ok]/expertise_norm[ok][:,np.newaxis]
- start_norm = (start/start_count[:,np.newaxis])
- end_norm = (end/end_count[:,np.newaxis])
- print(start_norm.shape)
- print(end_norm.shape)
- print(start_norm.mean(axis=0))
- print(end_norm.mean(axis=0))
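- # Export one row per retained author with raw early/late counts and the normalized
- # expertise profile.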
- aggregate = {}
- for i in range(n_clusters):
-     aggregate[cluster_names_start[i]] = start[:,i]
-     aggregate[cluster_names_end[i]] = end[:,i]
-     aggregate[cluster_names_expertise[i]] = expertise[:,i]
- aggregate = pd.DataFrame(aggregate)
- aggregate["bai"] = [bai for i, bai in enumerate(authorlist) if ok[i]]
- aggregate.to_csv(opj(args.location, "aggregate.csv"))
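- # Correlation structure of early- and late-period topic shares across authors.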
- sns.heatmap(np.corrcoef(start_norm.T, end_norm.T), vmin=-0.5, vmax=0.5, cmap="RdBu")
- plt.show()