# authors_sociality.py

import argparse
from itertools import combinations
from os.path import join as opj

import networkx as nx
import numpy as np
import pandas as pd
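
# Command-line interface: topic-model output directory (--input), dataset
# location (--dataset), time window (--begin/--end), optional CSV of selected
# authors (--authors), and optional category filters (--categories).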
parser = argparse.ArgumentParser()
parser.add_argument("--input")
parser.add_argument("--dataset", default="inspire-harvest/database")
parser.add_argument("--begin", type=int, default=2000)
parser.add_argument("--end", type=int, default=2009)
parser.add_argument("--authors", default=None)
parser.add_argument("--categories", nargs="+", default=[], required=False)
args = parser.parse_args()
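
# Number of topics in the fitted topic model (one row per topic in topics.csv).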
n_topics = len(pd.read_csv(opj(args.input, "topics.csv")))
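
# Load article metadata and keep only articles published within the time window.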
articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[
    ["date_created", "categories", "article_id"]
]
articles = articles[articles["date_created"].str.len() >= 4]
if "year" not in articles.columns:
    articles["year"] = articles["date_created"].str[:4].astype(int)
else:
    articles["year"] = articles["year"].astype(int)
articles = articles[(articles["year"] >= args.begin) & (articles["year"] <= args.end)]
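
# Attach each article's topic counts (one row of topics_counts.npy per row of
# articles.csv) and keep only articles present in both sources.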
topic_matrix = np.load(opj(args.input, "topics_counts.npy"))
_articles = pd.read_csv(opj(args.input, "articles.csv"))
_articles["topics"] = [topic_matrix[i, :] for i in range(len(_articles))]
articles["article_id"] = articles.article_id.astype(int)
articles = _articles.merge(articles, how="inner").set_index("article_id")
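
# Optionally restrict to articles tagged with at least one of the given categories.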
if len(args.categories):
    articles = articles[
        articles.categories.map(lambda l: any(x in l for x in args.categories))
    ]
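
# Map each retained article to the list of its authors' identifiers (bai).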
articles_authors = pd.read_parquet(opj(args.dataset, "articles_authors.parquet"))
articles_authors["article_id"] = articles_authors.article_id.astype(int)
articles_authors = articles_authors[articles_authors["article_id"].isin(articles.index)]
articles_authors_list = articles_authors.groupby("article_id").agg(
    authors=("bai", lambda l: "||".join(filter(None, l)))
)
articles = articles.merge(articles_authors_list, left_index=True, right_index=True)
articles["authors"] = articles["authors"].map(lambda s: s.split("||"))
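
# Build the weighted co-authorship network. Each shared article with n authors
# contributes a tie of strength 1/(n-1); when two authors share several
# articles, the strongest tie is kept. Articles with 50 or more authors are
# skipped.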
G = nx.Graph()
for article_id, authors in articles_authors.groupby("article_id"):
    if len(authors) >= 50:
        continue
    for a, b in combinations(authors["bai"].tolist(), 2):
        if G.has_edge(a, b):
            G[a][b]["weight"] = max(G[a][b]["weight"], 1 / (len(authors) - 1))
        else:
            G.add_edge(a, b, weight=1 / (len(authors) - 1))
# degree = G.degree(weight="weight")
# degree = {node: value for node, value in degree}
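
# Authors for whom pooled resources are computed: --authors if given,
# otherwise the aggregate.csv found in the --input directory.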
selected_authors = pd.read_csv(
    opj(args.input, "aggregate.csv") if args.authors is None else args.authors
)
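
# Weighted degree and brokerage for every author in the network. An author
# brokers a pair of co-authors when the two are not directly connected and
# share no common neighbor other than the author; each such open triad adds
# the product of the two tie weights.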
N = len(G.nodes)
brokerage = np.zeros(N)
degree = np.zeros(N)
for i, bai in enumerate(G.nodes):
    co_authors = list(G.neighbors(bai))
    degree[i] = np.sum([G[bai][x]["weight"] for x in co_authors])
    for x, y in combinations(co_authors, 2):
        if not G.has_edge(x, y):
            common_neighbors = set(G.neighbors(x)) & set(G.neighbors(y))
            b = G[bai][x]["weight"] * G[bai][y]["weight"]
            if len(common_neighbors) <= 1:
                brokerage[i] += b

pd.DataFrame({
    "bai": list(G.nodes), "brokerage": brokerage, "degree": degree
}).to_csv(opj(args.input, f"brokerage_{args.begin}_{args.end}.csv"))
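
# Pooled resources: for each selected author, the tie-weighted sum of each
# co-author's expertise, where expertise is the normalized topic mix of the
# co-author's publications that do not involve the focal author, each
# publication down-weighted by its number of authors.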
N = len(selected_authors)
pooled_resources = np.zeros((N, n_topics))
for i, bai in enumerate(selected_authors["bai"].tolist()):
    if bai not in G.nodes:
        continue
    co_authors = list(G.neighbors(bai))
    for co_author in co_authors:
        co_author_own_pubs = articles[
            articles["authors"].apply(lambda l: co_author in l and bai not in l)
        ]
        if len(co_author_own_pubs) == 0:
            continue
        co_author_expertise = np.stack(co_author_own_pubs["topics"].fillna(0).values)
        weight = np.array(1.0 / co_author_own_pubs.authors.map(len))
        co_author_expertise = co_author_expertise * weight[:, np.newaxis]
        co_author_expertise = (
            co_author_expertise.sum(axis=0) / co_author_expertise.sum()
        )
        co_author_expertise = np.nan_to_num(co_author_expertise)
        print(bai, G[bai][co_author]["weight"], len(co_author_own_pubs), co_author_expertise.argmax(), weight.mean())
        pooled_resources[i, :] += G[bai][co_author]["weight"] * co_author_expertise
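
# Store one pooled-resources vector per selected author and write the result.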
selected_authors["pooled_resources"] = [
    pooled_resources[i] for i in range(len(selected_authors))
]
if args.begin != 2000 or args.end != 2009:
    selected_authors[["bai", "pooled_resources"]].to_parquet(
        opj(args.input, f"pooled_resources_{args.begin}_{args.end}.parquet")
    )
else:
    selected_authors[["bai", "pooled_resources"]].to_parquet(
        opj(args.input, "pooled_resources.parquet")
    )