#!/usr/bin/env python
# coding: utf-8
# Build per-author topic profiles from article-level topic keyword counts:
# select authors active in both an early and a late period, aggregate their
# topic counts for each period, and save the aggregates together with full
# per-year records.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import pickle
from os.path import join as opj
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--location")
parser.add_argument("--dataset", default="inspire-harvest/database")
parser.add_argument("--keywords-threshold", type=int, default=200)
parser.add_argument("--articles-threshold", type=int, default=5)
parser.add_argument("--early-periods", nargs="+", type=int, default=[0, 1])
parser.add_argument("--late-periods", nargs="+", type=int, default=[3])  # [2,3] for ACL, [3] for HEP
parser.add_argument("--fla", action="store_true", help="only count papers where the author is first or last author")
args = parser.parse_args()
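# Periods are 5-year buckets of the publication year (year // 5, with years
# counted as offsets from 2000). Non-default period choices are encoded into
# the output filenames via custom_range below.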
custom_range = (
    "_" + "-".join(map(str, args.early_periods)) + "_" + "-".join(map(str, args.late_periods))
    if (args.early_periods != [0, 1] or args.late_periods != [3])
    else ""
)
print(custom_range)
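# Topic labels and the (articles x topics) keyword-count matrix; rows of
# topics_counts.npy are assumed to align with the rows of articles.csv.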
topics = pd.read_csv(opj(args.location, "topics.csv"))["label"].tolist()
topic_matrix = np.load(opj(args.location, "topics_counts.npy"))
articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[["article_id", "date_created", "title"]]
articles = articles[articles["date_created"].str.len() >= 4]
# Years are offsets from 2000; keep 2000-2040 and bucket into 5-year groups.
articles["year"] = articles["date_created"].str[:4].astype(int) - 2000
articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
articles["year_group"] = articles["year"] // 5
_articles = pd.read_csv(opj(args.location, "articles.csv"))
articles["article_id"] = articles.article_id.astype(int)
articles = _articles.merge(articles, how="left")
print(len(_articles))
print(len(articles))
# Dominant topic per article, printed as a quick sanity check.
articles["main_topic"] = topic_matrix.argmax(axis=1)
articles["main_topic"] = articles["main_topic"].map(lambda k: topics[k])
print(articles[["title", "main_topic"]].sample(frac=1).head(10))
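# Per-article author lists; derive each article's author count and its
# first and last author.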
all_authors = pd.read_parquet(opj(args.dataset, "articles_authors.parquet"))
all_authors["article_id"] = all_authors.article_id.astype(int)
n_authors = all_authors.groupby("article_id").agg(
    n_authors=("bai", "nunique"),
    first_author=("bai", "first"),
    last_author=("bai", "last")
).reset_index()
n_articles = len(articles)
articles = articles.merge(n_authors, how="left", on="article_id")
assert len(articles) == n_articles, "# of articles does not match! cannot continue"
all_authors = all_authors.merge(articles, how="inner", on="article_id")
all_authors["year_range"] = all_authors["year"] // 5
n_papers = all_authors.groupby(["bai", "year_range"]).agg(n=("article_id", "count")).reset_index()
filtered_authors = []
for author, n in n_papers.groupby("bai"):
    start = n[n["year_range"].isin(args.early_periods)]
    end = n[n["year_range"].isin(args.late_periods)]
    if len(start) and len(end):
        filtered_authors.append({
            "author": author,
            "n_start": start["n"].sum(),  # total papers across all early periods
            "n_end": end["n"].sum(),  # total papers across all late periods
        })
filtered_authors = pd.DataFrame(filtered_authors)
filtered_authors = filtered_authors[(filtered_authors["n_start"] >= args.articles_threshold) & (filtered_authors["n_end"] >= args.articles_threshold)]
authors = all_authors[all_authors["bai"].isin(filtered_authors["author"])]
start_authors = authors[authors["year_range"].isin(args.early_periods)]
end_authors = authors[authors["year_range"].isin(args.late_periods)]
authorlist = list(authors["bai"].unique())
inv_articles = {n: i for i, n in enumerate(articles["article_id"].values)}
inv_authorlist = {author: i for i, author in enumerate(authorlist)}
n_authors = len(authorlist)
n_clusters = topic_matrix.shape[1]
n_years = articles["year"].max() + 1
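# Accumulators: start/end hold raw topic keyword counts per author for each
# period; expertise weights each paper's topic counts by 1 / n_authors.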
start = np.zeros((n_authors, n_clusters))
end = np.zeros((n_authors, n_clusters))
expertise = np.zeros((n_authors, n_clusters))
start_count = np.zeros(n_authors)
end_count = np.zeros(n_authors)
expertise_norm = np.zeros(n_authors)
for author, author_articles in start_authors.groupby("bai"):
    for article in author_articles.to_dict(orient="records"):
        article_id = article["article_id"]
        n = articles.iloc[inv_articles[article_id]]["n_authors"]
        expertise[inv_authorlist[author]] += (1 / n) * topic_matrix[inv_articles[article_id], :]
        expertise_norm[inv_authorlist[author]] += (1 / n) * topic_matrix[inv_articles[article_id], :].sum()
        # With --fla, start/end profiles only count first- or last-author
        # papers; expertise above still counts every paper.
        if args.fla and author not in [article["first_author"], article["last_author"]]:
            continue
        start[inv_authorlist[author], :] += topic_matrix[inv_articles[article_id], :]
        start_count[inv_authorlist[author]] += topic_matrix[inv_articles[article_id], :].sum()

for author, author_articles in end_authors.groupby("bai"):
    for article in author_articles.to_dict(orient="records"):
        article_id = article["article_id"]
        if args.fla and author not in [article["first_author"], article["last_author"]]:
            continue
        end[inv_authorlist[author], :] += topic_matrix[inv_articles[article_id], :]
        end_count[inv_authorlist[author]] += topic_matrix[inv_articles[article_id], :].sum()
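# Full per-year topic records for every author in the corpus (not just the
# period-filtered ones), saved for downstream analyses.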
authors_records = {}
for author, author_articles in all_authors.groupby("bai"):
    record = np.zeros((n_years, n_clusters))
    record_count = np.zeros((n_years, n_clusters))
    for article in author_articles.to_dict(orient="records"):
        year = article["year"]
        article_id = article["article_id"]
        record[year, :] += topic_matrix[inv_articles[article_id], :]
        record_count[year] += topic_matrix[inv_articles[article_id], :].sum()
    authors_records[author] = {
        "record": record,
        "record_count": record_count
    }
suffix = "_fla" if args.fla else ""
with open(opj(args.location, f"authors_full_records{suffix}{custom_range}.pickle"), "wb") as handle:
    pickle.dump(authors_records, handle, protocol=pickle.HIGHEST_PROTOCOL)
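# Keep only authors with at least --keywords-threshold topic keywords in both
# periods, then normalize counts into per-period topic distributions.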
ok = (start_count >= args.keywords_threshold) & (end_count >= args.keywords_threshold)
cluster_names_start = [f"start_{n+1}" for n in range(n_clusters)]
cluster_names_end = [f"end_{n+1}" for n in range(n_clusters)]
cluster_names_expertise = [f"expertise_{n+1}" for n in range(n_clusters)]
start = start[ok]
end = end[ok]
start_count = start_count[ok]
end_count = end_count[ok]
expertise = expertise[ok] / expertise_norm[ok][:, np.newaxis]
start_norm = start / start_count[:, np.newaxis]
end_norm = end / end_count[:, np.newaxis]
print(start_norm.shape)
print(end_norm.shape)
print(start_norm.mean(axis=0))
print(end_norm.mean(axis=0))
aggregate = {}
for i in range(n_clusters):
    aggregate[cluster_names_start[i]] = start[:, i]
    aggregate[cluster_names_end[i]] = end[:, i]
    aggregate[cluster_names_expertise[i]] = expertise[:, i]
aggregate = pd.DataFrame(aggregate)
aggregate["bai"] = [bai for i, bai in enumerate(authorlist) if ok[i]]
aggregate.to_csv(opj(args.location, f"aggregate{suffix}{custom_range}.csv"))
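# Correlations between early- and late-period topic profiles across authors;
# the off-diagonal blocks correlate start topics with end topics.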
sns.heatmap(np.corrcoef(start_norm.T, end_norm.T), vmin=-0.5, vmax=0.5, cmap="RdBu")
plt.show()