"""Build the co-authorship network for a given time window, compute each
author's weighted degree and brokerage, and pool the topic expertise of
their collaborators ("pooled resources")."""

import argparse
from itertools import combinations
from os.path import join as opj

import networkx as nx
import numpy as np
import pandas as pd

parser = argparse.ArgumentParser()
parser.add_argument("--input")
parser.add_argument("--dataset", default="inspire-harvest/database")
parser.add_argument("--begin", type=int, default=2000)
parser.add_argument("--end", type=int, default=2009)
parser.add_argument("--authors", default=None)
parser.add_argument("--categories", nargs="+", default=[], required=False)
args = parser.parse_args()

n_topics = len(pd.read_csv(opj(args.input, "topics.csv")))

# Load articles and keep those published within [begin, end].
articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[
    ["date_created", "categories", "article_id"]
]
articles = articles[articles["date_created"].str.len() >= 4]
articles["year"] = articles["date_created"].str[:4].astype(int)
articles = articles[(articles["year"] >= args.begin) & (articles["year"] <= args.end)]

# Attach each article's topic distribution from the topic-model output.
topic_matrix = np.load(opj(args.input, "topics_counts.npy"))
_articles = pd.read_csv(opj(args.input, "articles.csv"))
_articles["topics"] = [topic_matrix[i, :] for i in range(len(_articles))]
articles["article_id"] = articles["article_id"].astype(int)
articles = _articles.merge(articles, how="inner").set_index("article_id")

# Optionally restrict to articles matching at least one of the given categories.
if len(args.categories):
    articles = articles[
        articles["categories"].map(lambda l: any(x in l for x in args.categories))
    ]

articles_authors = pd.read_parquet(opj(args.dataset, "articles_authors.parquet"))
articles_authors["article_id"] = articles_authors["article_id"].astype(int)
articles_authors = articles_authors[articles_authors["article_id"].isin(articles.index)]

articles_authors_list = articles_authors.groupby("article_id").agg(
    authors=("bai", lambda l: "||".join(filter(None, l)))
)
articles = articles.merge(articles_authors_list, left_index=True, right_index=True)
articles["authors"] = articles["authors"].map(lambda s: s.split("||"))

# Build the co-authorship graph. Each edge is weighted by 1/(n-1), where n is
# the number of authors of the paper; repeated collaborations keep the
# strongest tie. Papers with 50+ authors (large collaborations) are skipped.
G = nx.Graph()
for article_id, authors in articles_authors.groupby("article_id"):
    if len(authors) >= 50:
        continue
    for a, b in combinations(authors["bai"].tolist(), 2):
        if G.has_edge(a, b):
            G[a][b]["weight"] = max(G[a][b]["weight"], 1 / (len(authors) - 1))
        else:
            G.add_edge(a, b, weight=1 / (len(authors) - 1))

selected_authors = pd.read_csv(
    opj(args.input, "aggregate.csv") if args.authors is None else args.authors
)

# Weighted degree and brokerage. An author brokers a pair of their co-authors
# (x, y) when x and y are not directly connected and share at most one common
# neighbor (the focal author themselves); each brokered pair contributes the
# product of the two tie weights.
N = len(G.nodes)
brokerage = np.zeros(N)
degree = np.zeros(N)

for i, bai in enumerate(G.nodes):
    co_authors = list(G.neighbors(bai))
    degree[i] = np.sum([G[bai][x]["weight"] for x in co_authors])
    for x, y in combinations(co_authors, 2):
        if not G.has_edge(x, y):
            common_neighbors = set(G.neighbors(x)) & set(G.neighbors(y))
            if len(common_neighbors) <= 1:
                brokerage[i] += G[bai][x]["weight"] * G[bai][y]["weight"]

pd.DataFrame(
    {"bai": list(G.nodes), "brokerage": brokerage, "degree": degree}
).to_csv(opj(args.input, f"brokerage_{args.begin}_{args.end}.csv"))

# Pooled resources: for each selected author, sum over their co-authors the
# co-author's topic expertise, scaled by the tie weight. Expertise is computed
# from the co-author's publications that do not involve the focal author, each
# paper weighted by 1/number of authors, then normalized to sum to one.
N = len(selected_authors)
pooled_resources = np.zeros((N, n_topics))

for i, bai in enumerate(selected_authors["bai"].tolist()):
    if bai not in G.nodes:
        continue
    for co_author in G.neighbors(bai):
        co_author_own_pubs = articles[
            articles["authors"].apply(lambda l: co_author in l and bai not in l)
        ]
        if len(co_author_own_pubs) == 0:
            continue
        co_author_expertise = np.stack(co_author_own_pubs["topics"].fillna(0).values)
        weight = np.array(1.0 / co_author_own_pubs["authors"].map(len))
        co_author_expertise = co_author_expertise * weight[:, np.newaxis]
        co_author_expertise = co_author_expertise.sum(axis=0) / co_author_expertise.sum()
        co_author_expertise = np.nan_to_num(co_author_expertise)
        print(
            bai,
            G[bai][co_author]["weight"],
            len(co_author_own_pubs),
            co_author_expertise.argmax(),
            weight.mean(),
        )
        pooled_resources[i, :] += G[bai][co_author]["weight"] * co_author_expertise

selected_authors["pooled_resources"] = [
    pooled_resources[i] for i in range(len(selected_authors))
]

# Keep the unsuffixed filename for the default 2000-2009 window.
if args.begin != 2000 or args.end != 2009:
    selected_authors[["bai", "pooled_resources"]].to_parquet(
        opj(args.input, f"pooled_resources_{args.begin}_{args.end}.parquet")
    )
else:
    selected_authors[["bai", "pooled_resources"]].to_parquet(
        opj(args.input, "pooled_resources.parquet")
    )
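
# Example invocation, for reference. The script name, the --input directory
# layout, and the category labels are hypothetical assumptions; only the flag
# names and the --dataset default come from the argument parser above:
#
#   python pooled_resources.py --input output/topic_model \
#       --dataset inspire-harvest/database \
#       --begin 2000 --end 2009 \
#       --categories Theory-HEP Phenomenology-HEP
#
# This would write brokerage_2000_2009.csv and pooled_resources.parquet into
# the --input directory (the unsuffixed parquet name because 2000-2009 is the
# default window).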