- #!/usr/bin/env python
- # coding: utf-8
- import pandas as pd
- import numpy as np
- import networkx as nx
- from ipysigma import Sigma
- from matplotlib import pyplot as plt
- import seaborn as sns
- import pickle
- from os.path import join as opj
- import argparse
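- # Command-line options: where to read/write analysis files, which dataset to use,
- # and the thresholds applied below when selecting authors.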
- parser = argparse.ArgumentParser()
- parser.add_argument("--location")
- parser.add_argument("--dataset", default="inspire-harvest/database")
- parser.add_argument("--keywords-threshold", type=int, default=200)
- parser.add_argument("--articles-threshold", type=int, default=5)
- parser.add_argument("--late-periods", nargs="+", type=int, default=[3]) # [2,3] for ACL, [3] for HEP
- args = parser.parse_args()
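- # Topic labels and the article-by-topic count matrix; rows are assumed to be aligned
- # with the rows of articles.csv loaded below.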
- topics = pd.read_csv(opj(args.location, "topics.csv"))["label"].tolist()
- topic_matrix = np.load(opj(args.location, "topics_counts.npy"))
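- # Article metadata: keep records with a usable creation date, derive an integer year
- # offset (from 2000 when computed from date_created, 2002 when the source already has
- # a year column), restrict to offsets 0-40, and bin into 5-year groups.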
- articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[["article_id", "date_created", "title"]]
- articles = articles[articles["date_created"].str.len() >= 4]
- if "years" not in articles.columns:
- articles["year"] = articles["date_created"].str[:4].astype(int)-2000
- else:
- articles["year"] = articles["year"].astype(int)-2002
- articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
- articles["year_group"] = articles["year"]//5
- _articles = pd.read_csv(opj(args.location,"articles.csv"))
- articles["article_id"] = articles.article_id.astype(int)
- articles = _articles.merge(articles, how="left")
- print(len(_articles))
- print(len(articles))
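- # Sanity check: assign each article its dominant topic and inspect a random sample.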
- articles["main_topic"] = topic_matrix.argmax(axis=1)
- articles["main_topic"] = articles["main_topic"].map(lambda k: topics[k])
- print(articles[["title", "main_topic"]].sample(frac=1).head(10))
- print(articles[["title", "main_topic"]].sample(frac=1).head(10))
- all_authors = pd.read_parquet(opj(args.dataset, "articles_authors.parquet"))
- all_authors["article_id"] = all_authors.article_id.astype(int)
- n_authors = all_authors.groupby("article_id").agg(n_authors=("bai", lambda x: x.nunique())).reset_index()
- n_articles = len(articles)
- articles = articles.merge(n_authors, how="left", left_on="article_id", right_on="article_id")
- assert len(articles)==n_articles, "# of articles does not match! cannot continue"
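- # Attach article metadata to each authorship and bin careers into 5-year periods.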
- all_authors = all_authors.merge(articles, how="inner", left_on="article_id", right_on="article_id")
- all_authors["year_range"] = all_authors["year"]//5
- n_papers = all_authors.groupby(["bai", "year_range"]).agg(n=("article_id", "count")).reset_index()
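- # Keep authors with publications both in the early periods (year_range 0-1) and in the
- # requested late periods; note that only the first matching period's count on each side
- # is compared against --articles-threshold.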
- filtered_authors = []
- for author, n in n_papers.groupby("bai"):
-     start = n[n["year_range"]<=1]
-     # end = n[n["year_range"]==3]
-     end = n[n["year_range"].isin(args.late_periods)]
-     if len(start) and len(end):
-         filtered_authors.append({
-             "author": author,
-             "n_start": start.iloc[0]["n"],
-             "n_end": end.iloc[0]["n"],
-         })
- filtered_authors = pd.DataFrame(filtered_authors)
- filtered_authors = filtered_authors[(filtered_authors["n_start"] >= args.articles_threshold) & (filtered_authors["n_end"] >= args.articles_threshold)]
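- # Restrict authorship records to the retained authors and build index lookups;
- # n_authors is reused below as the number of retained authors.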
- authors = all_authors[all_authors["bai"].isin(filtered_authors["author"])]
- start_authors = authors[authors["year_range"]<=1]
- # end_authors = authors[authors["year_range"]==3]
- end_authors = authors[authors["year_range"].isin(args.late_periods)]
- authorlist = list(authors["bai"].unique())
- inv_articles = {n: i for i,n in enumerate(articles["article_id"].values)}
- inv_authorlist = {author: i for i, author in enumerate(authorlist)}
- n_authors = len(authorlist)
- n_clusters = topic_matrix.shape[1]
- n_years = articles["year"].max()+1
- start = np.zeros((n_authors, n_clusters))
- end = np.zeros((n_authors, n_clusters))
- expertise = np.zeros((n_authors, n_clusters))
- start_count = np.zeros(n_authors)
- end_count = np.zeros(n_authors)
- expertise_norm = np.zeros(n_authors)
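- # Accumulate early-period topic counts; expertise divides each article's counts by its
- # author count so that heavily co-authored papers contribute less.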
- for author, _articles in start_authors.groupby("bai"):
-     for article_id in _articles["article_id"].tolist():
-         start[inv_authorlist[author],:] += topic_matrix[inv_articles[article_id],:].flat
-         start_count[inv_authorlist[author]] += topic_matrix[inv_articles[article_id],:].sum()
-         n = articles.iloc[inv_articles[article_id]]["n_authors"]
-         expertise[inv_authorlist[author]] += (1/n)*topic_matrix[inv_articles[article_id],:].flat
-         expertise_norm[inv_authorlist[author]] += (1/n)*topic_matrix[inv_articles[article_id],:].sum()
-
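- # Accumulate late-period topic counts.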
- for author, _articles in end_authors.groupby("bai"):
-     for article_id in _articles["article_id"].tolist():
-         end[inv_authorlist[author],:] += topic_matrix[inv_articles[article_id],:].flat
-         end_count[inv_authorlist[author]] += topic_matrix[inv_articles[article_id],:].sum()
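- # Full per-year topic records for every author, saved for later analyses.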
- authors_records = {}
- for author, _articles in all_authors.groupby("bai"):
-     record = np.zeros((n_years, n_clusters))
-     record_count = np.zeros((n_years, n_clusters))
-     for article in _articles.to_dict(orient="records"):
-         year = article["year"]
-         article_id = article["article_id"]
-
-         record[year,:] += topic_matrix[inv_articles[article_id],:].flat
-         record_count[year] += topic_matrix[inv_articles[article_id],:].sum()
-     authors_records[author] = {
-         "record": record,
-         "record_count": record_count
-     }
- with open(opj(args.location, "authors_full_records.pickle"), "wb") as handle:
-     pickle.dump(authors_records, handle, protocol=pickle.HIGHEST_PROTOCOL)
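- # Keep authors with enough topic-keyword mass in both periods, then normalize the
- # early/late profiles into per-author topic distributions.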
- ok = (start_count >= args.keywords_threshold) & (end_count >= args.keywords_threshold)
- cluster_names_start = [f"start_{n+1}" for n in range(n_clusters)]
- cluster_names_end = [f"end_{n+1}" for n in range(n_clusters)]
- cluster_names_expertise = [f"expertise_{n+1}" for n in range(n_clusters)]
- start = start[ok]
- end = end[ok]
- start_count = start_count[ok]
- end_count = end_count[ok]
- expertise = expertise[ok]/expertise_norm[ok][:,np.newaxis]
- start_norm = (start/start_count[:,np.newaxis])
- end_norm = (end/end_count[:,np.newaxis])
- print(start_norm.shape)
- print(end_norm.shape)
- print(start_norm.mean(axis=0))
- print(end_norm.mean(axis=0))
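- # Export one row per retained author with raw early/late counts and the normalized
- # expertise profile.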
- aggregate = {}
- for i in range(n_clusters):
-     aggregate[cluster_names_start[i]] = start[:,i]
-     aggregate[cluster_names_end[i]] = end[:,i]
-     aggregate[cluster_names_expertise[i]] = expertise[:,i]
- aggregate = pd.DataFrame(aggregate)
- aggregate["bai"] = [bai for i, bai in enumerate(authorlist) if ok[i]]
- aggregate.to_csv(opj(args.location, "aggregate.csv"))
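- # Correlation structure of early- and late-period topic shares across authors.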
- sns.heatmap(np.corrcoef(start_norm.T, end_norm.T), vmin=-0.5, vmax=0.5, cmap="RdBu")
- plt.show()