authors_sociality.py

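"""Build a weighted co-authorship network from INSPIRE (inspire-harvest)
article data and, for a set of selected authors, compute brokerage, weighted
degree, and a topic-level "pooled resources" vector aggregated from
co-authors' expertise.

Writes brokerage.csv and pooled_resources.parquet into the --input directory.
"""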
import pandas as pd
import numpy as np
import argparse
import networkx as nx
from os.path import join as opj
from itertools import combinations
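
# Command-line interface: topic-model output directory (--input), dataset
# location, publication-year window, and an optional category filter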
parser = argparse.ArgumentParser()
parser.add_argument("--input")
parser.add_argument("--dataset", default="inspire-harvest/database")
parser.add_argument("--begin", type=int, default=2000)
parser.add_argument("--end", type=int, default=2009)
parser.add_argument("--categories", nargs="+", default=[], required=False)
args = parser.parse_args()
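
# Number of topics in the fitted model (one row per topic in topics.csv)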
n_topics = len(pd.read_csv(opj(args.input, "topics.csv")))
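
# Article metadata: keep records with a usable creation date, derive the
# publication year, and restrict to the [begin, end] window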
articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[
    ["date_created", "categories", "article_id"]
]
articles = articles[articles["date_created"].str.len() >= 4]
if "year" not in articles.columns:
    articles["year"] = articles["date_created"].str[:4].astype(int)
else:
    articles["year"] = articles["year"].astype(int)
articles = articles[(articles["year"] >= args.begin) & (articles["year"] <= args.end)]
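
# Per-article topic counts from the topic model, aligned row-by-row with
# articles.csv, then merged (inner) with the metadata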
topic_matrix = np.load(opj(args.input, "topics_counts.npy"))
_articles = pd.read_csv(opj(args.input, "articles.csv"))
_articles["topics"] = [topic_matrix[i, :] for i in range(len(_articles))]
articles["article_id"] = articles.article_id.astype(int)
articles = _articles.merge(articles, how="inner").set_index("article_id")
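
# Optionally keep only articles tagged with at least one requested category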
if len(args.categories):
    articles = articles[
        articles.categories.map(lambda l: any(x in l for x in args.categories))
    ]
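
# Authorship records (author identifiers, "bai", per article), restricted to
# the selected articles; attach each article's author list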
articles_authors = pd.read_parquet(opj(args.dataset, "articles_authors.parquet"))
articles_authors["article_id"] = articles_authors.article_id.astype(int)
articles_authors = articles_authors[articles_authors["article_id"].isin(articles.index)]
articles_authors_list = articles_authors.groupby("article_id").agg(
    authors=("bai", lambda l: "||".join(filter(None, l)))
)
articles = articles.merge(articles_authors_list, left_index=True, right_index=True)
articles["authors"] = articles["authors"].map(lambda s: s.split("||"))
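
# Co-authorship network: papers with 50+ authors are skipped, each tie is
# weighted by 1 / (n_authors - 1), and repeated collaborations keep the
# strongest tie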
G = nx.Graph()
for article_id, authors in articles_authors.groupby("article_id"):
    if len(authors) >= 50:
        continue  # skip hyper-authored papers
    for a, b in combinations(authors["bai"].tolist(), 2):
        if G.has_edge(a, b):
            G[a][b]["weight"] = max(G[a][b]["weight"], 1 / (len(authors) - 1))
        else:
            G.add_edge(a, b, weight=1 / (len(authors) - 1))
# degree = G.degree(weight="weight")
# degree = {node: value for node, value in degree}
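
# Brokerage and weighted degree for every author in the network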
selected_authors = pd.read_csv(opj(args.input, "aggregate.csv"))
N = len(G.nodes)
brokerage = np.zeros(N)
degree = np.zeros(N)
for i, bai in enumerate(G.nodes):
    co_authors = list(G.neighbors(bai))
    degree[i] = np.sum([G[bai][x]["weight"] for x in co_authors])  # weighted degree
    for x, y in combinations(co_authors, 2):
        if not G.has_edge(x, y):
            common_neighbors = set(G.neighbors(x)) & set(G.neighbors(y))
            b = G[bai][x]["weight"] * G[bai][y]["weight"]
            # bai belongs to both neighbor sets, so <= 1 means bai is the only
            # common neighbor, i.e. the sole broker between x and y
            if len(common_neighbors) <= 1:
                brokerage[i] += b
pd.DataFrame({
    "bai": list(G.nodes), "brokerage": brokerage, "degree": degree
}).to_csv(opj(args.input, "brokerage.csv"))
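
# Pooled resources: for each selected author, a tie-weighted sum of the
# topic-expertise distributions of their co-authors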
N = len(selected_authors)
pooled_resources = np.zeros((N, n_topics))
for i, bai in enumerate(selected_authors["bai"].tolist()):
    if bai not in G.nodes:
        continue
    co_authors = list(G.neighbors(bai))
    for co_author in co_authors:
        # the co-author's publications that do not involve the focal author
        co_author_own_pubs = articles[
            articles["authors"].apply(lambda l: co_author in l and bai not in l)
        ]
        if len(co_author_own_pubs) == 0:
            continue
        co_author_expertise = np.stack(co_author_own_pubs["topics"].fillna(0).values)
        # down-weight each publication by its number of authors
        weight = np.array(1.0 / co_author_own_pubs.authors.map(len))
        co_author_expertise = co_author_expertise * weight[:, np.newaxis]
        # normalize into a distribution over topics
        co_author_expertise = (
            co_author_expertise.sum(axis=0) / co_author_expertise.sum()
        )
        co_author_expertise = np.nan_to_num(co_author_expertise)
        # diagnostic output
        print(bai, G[bai][co_author]["weight"], len(co_author_own_pubs), co_author_expertise.argmax(), weight.mean())
        pooled_resources[i, :] += G[bai][co_author]["weight"] * co_author_expertise
selected_authors["pooled_resources"] = [
    pooled_resources[i] for i in range(len(selected_authors))
]
selected_authors[["bai", "pooled_resources"]].to_parquet(
    opj(args.input, "pooled_resources.parquet")
)
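
# Example invocation (paths and category values are illustrative only):
#   python authors_sociality.py --input output/topic_model_run \
#       --dataset inspire-harvest/database --begin 2000 --end 2009 \
#       --categories Theory-HEP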