123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- import pandas as pd
- import numpy as np
- import argparse
- import networkx as nx
- from os.path import join as opj
- from itertools import combinations
- import pickle
# Command-line interface: topic-model output dir, corpus location,
# time window, optional author list and category filter.
cli = argparse.ArgumentParser()
cli.add_argument("--input")
cli.add_argument("--dataset", default="inspire-harvest/database")
cli.add_argument("--begin", type=int, default=2000)
cli.add_argument("--end", type=int, default=2009)
cli.add_argument("--authors", default=None)
cli.add_argument("--categories", nargs="+", default=[], required=False)
args = cli.parse_args()
# Number of topics = number of rows in the topic list produced upstream.
n_topics = len(pd.read_csv(opj(args.input, "topics.csv")))

# Load the article corpus, keeping only the columns needed downstream.
articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[
    ["date_created", "categories", "article_id"]
]
# Drop records whose creation date is too short to contain a 4-digit year.
articles = articles[articles["date_created"].str.len() >= 4]
# The column subselection above guarantees "year" is absent, so the original
# `if "year" not in articles.columns` check was always true and its else
# branch unreachable; derive the year directly from the creation date.
articles["year"] = articles["date_created"].str[:4].astype(int)
# Restrict to the requested publication window (inclusive on both ends).
articles = articles[(articles["year"] >= args.begin) & (articles["year"] <= args.end)]
# Per-article topic counts, row-aligned with the topic model's articles.csv.
topic_matrix = np.load(opj(args.input, "topics_counts.npy"))
_articles = pd.read_csv(opj(args.input, "articles.csv"))
# Attach each article's topic-count vector as a column of arrays.
_articles["topics"] = [topic_matrix[row] for row in range(len(_articles))]

articles["article_id"] = articles["article_id"].astype(int)
# Inner merge keeps only articles known to both the corpus and the model.
articles = _articles.merge(articles, how="inner").set_index("article_id")

# Optionally restrict to articles carrying at least one requested category.
if args.categories:
    in_categories = articles["categories"].map(
        lambda cats: any(wanted in cats for wanted in args.categories)
    )
    articles = articles[in_categories]
# Authorship table: one row per (article, author) pair; "bai" is the
# author identifier.
articles_authors = pd.read_parquet(opj(args.dataset, "articles_authors.parquet"))
articles_authors["article_id"] = articles_authors["article_id"].astype(int)
# Keep only authorship rows for articles that survived the filters above.
articles_authors = articles_authors[articles_authors["article_id"].isin(articles.index)]

# Collapse to one "||"-joined author string per article (empty ids dropped).
per_article_authors = articles_authors.groupby("article_id").agg(
    authors=("bai", lambda bais: "||".join(filter(None, bais)))
)
articles = articles.merge(per_article_authors, left_index=True, right_index=True)
# Split back into a list column. NOTE: `.str.split("||")` would treat the
# pattern as a regex, so the explicit split is kept.
articles["authors"] = articles["authors"].map(lambda joined: joined.split("||"))
# Weighted co-authorship graph. An edge (a, b) carries the strongest
# 1/(n-1) tie the two authors ever shared on an n-author paper;
# very large collaborations (>= 50 authors) are skipped entirely.
G = nx.Graph()
for article_id, authors in articles_authors.groupby("article_id"):
    n_coauthors = len(authors)
    if n_coauthors >= 50:
        continue
    if n_coauthors < 2:
        # A single author yields no pairs (and no 1/(n-1) division).
        continue
    tie = 1 / (n_coauthors - 1)
    for a, b in combinations(authors["bai"].tolist(), 2):
        if not G.has_edge(a, b):
            G.add_edge(a, b, weight=tie)
        elif tie > G[a][b]["weight"]:
            G[a][b]["weight"] = tie
# Authors to compute pooled resources for: an explicit list if given on the
# command line, otherwise the aggregate produced upstream in the input dir.
# BUG FIX: the original tested `authors is None`, where `authors` was the
# leftover groupby loop variable (a DataFrame, never None) — so the
# aggregate.csv fallback was unreachable and the default invocation
# crashed on pd.read_csv(None). The intended test is on args.authors.
selected_authors = pd.read_csv(
    opj(args.input, "aggregate.csv") if args.authors is None else args.authors
)
# Weighted degree and brokerage for every node of the graph. A node brokers
# a pair of its neighbours when the two are not directly connected and
# share at most one common neighbour besides it.
n_nodes = len(G.nodes)
brokerage = np.zeros(n_nodes)
degree = np.zeros(n_nodes)
for idx, node in enumerate(G.nodes):
    neighbours = list(G.neighbors(node))
    degree[idx] = np.sum([G[node][nb]["weight"] for nb in neighbours])
    for u, v in combinations(neighbours, 2):
        if G.has_edge(u, v):
            continue
        # u and v themselves are excluded from each other's neighbour sets,
        # so "<= 1" allows one bridging node besides `node`.
        shared = set(G.neighbors(u)) & set(G.neighbors(v))
        if len(shared) <= 1:
            brokerage[idx] += G[node][u]["weight"] * G[node][v]["weight"]
pd.DataFrame(
    {"bai": list(G.nodes), "brokerage": brokerage, "degree": degree}
).to_csv(opj(args.input, f"brokerage_{args.begin}_{args.end}.csv"))
# Pooled resources: for each selected author, accumulate the topic-level
# expertise of their co-authors — estimated from papers the co-author wrote
# WITHOUT the focal author — weighted by the co-authorship tie strength.
N = len(selected_authors)
pooled_resources = np.zeros((N, n_topics))
for i, bai in enumerate(selected_authors["bai"].tolist()):
    if bai not in G.nodes:
        continue

    for co_author in list(G.neighbors(bai)):
        # Publications the co-author produced without the focal author.
        solo_mask = articles["authors"].apply(
            lambda author_list: co_author in author_list and bai not in author_list
        )
        solo_pubs = articles[solo_mask]
        if solo_pubs.empty:
            continue
        # Stack topic-count vectors; down-weight many-author papers.
        expertise = np.stack(solo_pubs["topics"].fillna(0).values)
        paper_weight = np.array(1.0 / solo_pubs["authors"].map(len))
        expertise = expertise * paper_weight[:, np.newaxis]
        # Normalize into a topic distribution (0/0 -> 0 via nan_to_num).
        expertise = expertise.sum(axis=0) / expertise.sum()
        expertise = np.nan_to_num(expertise)
        print(bai, G[bai][co_author]["weight"], len(solo_pubs), expertise.argmax(), paper_weight.mean())
        pooled_resources[i, :] += G[bai][co_author]["weight"] * expertise
# Attach the pooled-resources vectors (row-aligned with selected_authors)
# and persist. The original dead assignment `bai = selected_authors["bai"]`
# (never read) has been removed, and the duplicated to_parquet branches are
# consolidated: the default 2000-2009 window keeps the historical
# unsuffixed filename, any other window gets a suffixed one.
selected_authors["pooled_resources"] = [
    pooled_resources[i] for i in range(len(selected_authors))
]
if args.begin == 2000 and args.end == 2009:
    output_name = "pooled_resources.parquet"
else:
    output_name = f"pooled_resources_{args.begin}_{args.end}.parquet"
selected_authors[["bai", "pooled_resources"]].to_parquet(opj(args.input, output_name))
|