123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- import pandas as pd
- import numpy as np
- import argparse
- import networkx as nx
- from os.path import join as opj
- from itertools import combinations
- import pickle
# Command-line interface: topic-model output dir, corpus location,
# time window, optional author list and category filter.
cli = argparse.ArgumentParser()
cli.add_argument("--input")
cli.add_argument("--dataset", default="inspire-harvest/database")
cli.add_argument("--begin", type=int, default=2000)
cli.add_argument("--end", type=int, default=2009)
cli.add_argument("--authors", default=None)
cli.add_argument("--categories", nargs="+", default=[], required=False)
args = cli.parse_args()
# Number of topics = number of rows in the topic list produced upstream.
n_topics = len(pd.read_csv(opj(args.input, "topics.csv")))

# Load the article corpus, keeping only the columns needed downstream.
articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[
    ["date_created", "categories", "article_id"]
]
# Drop records whose creation date is too short to contain a 4-digit year.
articles = articles[articles["date_created"].str.len() >= 4]
# The column subselection above guarantees "year" is absent, so the original
# `if "year" not in articles.columns` check was always true and its else
# branch unreachable; derive the year directly from the creation date.
articles["year"] = articles["date_created"].str[:4].astype(int)
# Restrict to the requested publication window (inclusive on both ends).
articles = articles[(articles["year"] >= args.begin) & (articles["year"] <= args.end)]
# Per-article topic counts, row-aligned with the topic model's articles.csv.
topic_matrix = np.load(opj(args.input, "topics_counts.npy"))
_articles = pd.read_csv(opj(args.input, "articles.csv"))
# Attach each article's topic-count vector as a column of arrays.
_articles["topics"] = [topic_matrix[row] for row in range(len(_articles))]

articles["article_id"] = articles["article_id"].astype(int)
# Inner merge keeps only articles known to both the corpus and the model.
articles = _articles.merge(articles, how="inner").set_index("article_id")

# Optionally restrict to articles carrying at least one requested category.
if args.categories:
    in_categories = articles["categories"].map(
        lambda cats: any(wanted in cats for wanted in args.categories)
    )
    articles = articles[in_categories]
# Authorship table: one row per (article, author) pair; "bai" is the
# author identifier.
articles_authors = pd.read_parquet(opj(args.dataset, "articles_authors.parquet"))
articles_authors["article_id"] = articles_authors["article_id"].astype(int)
# Keep only authorship rows for articles that survived the filters above.
articles_authors = articles_authors[articles_authors["article_id"].isin(articles.index)]

# Collapse to one "||"-joined author string per article (empty ids dropped).
per_article_authors = articles_authors.groupby("article_id").agg(
    authors=("bai", lambda bais: "||".join(filter(None, bais)))
)
articles = articles.merge(per_article_authors, left_index=True, right_index=True)
# Split back into a list column. NOTE: `.str.split("||")` would treat the
# pattern as a regex, so the explicit split is kept.
articles["authors"] = articles["authors"].map(lambda joined: joined.split("||"))
# Weighted co-authorship graph. An edge (a, b) carries the strongest
# 1/(n-1) tie the two authors ever shared on an n-author paper;
# very large collaborations (>= 50 authors) are skipped entirely.
G = nx.Graph()
for article_id, authors in articles_authors.groupby("article_id"):
    n_coauthors = len(authors)
    if n_coauthors >= 50:
        continue
    if n_coauthors < 2:
        # A single author yields no pairs (and no 1/(n-1) division).
        continue
    tie = 1 / (n_coauthors - 1)
    for a, b in combinations(authors["bai"].tolist(), 2):
        if not G.has_edge(a, b):
            G.add_edge(a, b, weight=tie)
        elif tie > G[a][b]["weight"]:
            G[a][b]["weight"] = tie
# Authors to compute pooled resources for: an explicit list if given on the
# command line, otherwise the aggregate produced upstream in the input dir.
# BUG FIX: the original tested `authors is None`, where `authors` was the
# leftover groupby loop variable (a DataFrame, never None) — so the
# aggregate.csv fallback was unreachable and the default invocation
# crashed on pd.read_csv(None). The intended test is on args.authors.
selected_authors = pd.read_csv(
    opj(args.input, "aggregate.csv") if args.authors is None else args.authors
)
# Weighted degree and brokerage for every node of the graph. A node brokers
# a pair of its neighbours when the two are not directly connected and
# share at most one common neighbour besides it.
n_nodes = len(G.nodes)
brokerage = np.zeros(n_nodes)
degree = np.zeros(n_nodes)
for idx, node in enumerate(G.nodes):
    neighbours = list(G.neighbors(node))
    degree[idx] = np.sum([G[node][nb]["weight"] for nb in neighbours])
    for u, v in combinations(neighbours, 2):
        if G.has_edge(u, v):
            continue
        # u and v themselves are excluded from each other's neighbour sets,
        # so "<= 1" allows one bridging node besides `node`.
        shared = set(G.neighbors(u)) & set(G.neighbors(v))
        if len(shared) <= 1:
            brokerage[idx] += G[node][u]["weight"] * G[node][v]["weight"]
pd.DataFrame(
    {"bai": list(G.nodes), "brokerage": brokerage, "degree": degree}
).to_csv(opj(args.input, f"brokerage_{args.begin}_{args.end}.csv"))
# Pooled resources: for each selected author, accumulate the topic-level
# expertise of their co-authors — estimated from papers the co-author wrote
# WITHOUT the focal author — weighted by the co-authorship tie strength.
N = len(selected_authors)
pooled_resources = np.zeros((N, n_topics))
for i, bai in enumerate(selected_authors["bai"].tolist()):
    if bai not in G.nodes:
        continue

    for co_author in list(G.neighbors(bai)):
        # Publications the co-author produced without the focal author.
        solo_mask = articles["authors"].apply(
            lambda author_list: co_author in author_list and bai not in author_list
        )
        solo_pubs = articles[solo_mask]
        if solo_pubs.empty:
            continue
        # Stack topic-count vectors; down-weight many-author papers.
        expertise = np.stack(solo_pubs["topics"].fillna(0).values)
        paper_weight = np.array(1.0 / solo_pubs["authors"].map(len))
        expertise = expertise * paper_weight[:, np.newaxis]
        # Normalize into a topic distribution (0/0 -> 0 via nan_to_num).
        expertise = expertise.sum(axis=0) / expertise.sum()
        expertise = np.nan_to_num(expertise)
        print(bai, G[bai][co_author]["weight"], len(solo_pubs), expertise.argmax(), paper_weight.mean())
        pooled_resources[i, :] += G[bai][co_author]["weight"] * expertise
# Attach the pooled-resources vectors (row-aligned with selected_authors)
# and persist. The original dead assignment `bai = selected_authors["bai"]`
# (never read) has been removed, and the duplicated to_parquet branches are
# consolidated: the default 2000-2009 window keeps the historical
# unsuffixed filename, any other window gets a suffixed one.
selected_authors["pooled_resources"] = [
    pooled_resources[i] for i in range(len(selected_authors))
]
if args.begin == 2000 and args.end == 2009:
    output_name = "pooled_resources.parquet"
else:
    output_name = f"pooled_resources_{args.begin}_{args.end}.parquet"
selected_authors[["bai", "pooled_resources"]].to_parquet(opj(args.input, output_name))
|