
social_divide.py 6.4 KB

# nest_asyncio lets pystan's asyncio-based sampler run even when an event loop
# is already active (e.g. inside a notebook).
import nest_asyncio
nest_asyncio.apply()
del nest_asyncio

import stan
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import beta
import argparse

# Render figures with the pgf backend (XeLaTeX) so they can be embedded in a LaTeX document.
matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "xelatex",
        "font.family": "serif",
        "font.serif": "Times New Roman",
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)
def is_susy(abstract: str):
    abstract = abstract.lower()
    return (
        "supersymmetry" in abstract
        or "supersymmetric" in abstract
        or "susy" in abstract
    )

def is_hep(categories):
    return any(["-HEP" in x for x in categories])
parser = argparse.ArgumentParser()
parser.add_argument('action', choices=["compute", "plot"])
# NOTE: the plotting branch below reads args.target, but the original listing never
# defined it; a default argument is assumed here so the script runs end to end.
parser.add_argument('--target', default="authors", choices=["authors", "institutions"])
args = parser.parse_args()
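# Usage: run `python social_divide.py compute` first to fit the model and save the
# posterior samples, then `python social_divide.py plot` to produce the figures.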
if args.action == "compute":
    # Load HEP articles and flag their categories (theory, experiment, phenomenology).
    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["article_id", "categories", "date_created"]]
    articles["is_hep"] = articles.categories.map(is_hep)
    articles["is_th"] = articles.categories.map(lambda l: "Theory-HEP" in l)
    articles["is_exp"] = articles.categories.map(lambda l: "Experiment-HEP" in l)
    articles["is_pheno"] = articles.categories.map(lambda l: "Phenomenology-HEP" in l)
    # articles["is_susy"] = articles["abstract"].map(is_susy)
    # Extract the publication year; empty dates become 0 and are dropped by the range cut below.
    articles["year"] = articles["date_created"].str[:4].replace('', 0).astype(float).fillna(0).astype(int)
    articles = articles[articles["is_hep"]]
    articles = articles[(articles["year"] >= 1980) & (articles["year"] < 2020)]

    # Count, for each author (identified by their BAI), how many of their articles fall
    # in each category, then keep a random sample of 2,500 authors with at least 3 articles.
    authors = pd.read_parquet("projects/inspire-harvest/database/articles_authors.parquet")[["article_id", "bai"]]
    authors = authors.merge(articles[["article_id", "is_th", "is_exp", "is_pheno"]], how='right')
    authors = authors.groupby("bai").agg(
        th=('is_th', 'sum'),
        exp=('is_exp', 'sum'),
        ph=('is_pheno', 'sum'),
        total=('article_id', 'count'),
        th_frac=('is_th', 'mean'),
        exp_frac=('is_exp', 'mean'),
        ph_frac=('is_pheno', 'mean')
    )
    authors = authors[authors['total'] >= 3]
    authors = authors.sample(frac=1).head(2500)
    authors.to_parquet('output/social_divide_authors.parquet')
    data = {
        'n_authors': len(authors),
        'n_types': 3,
        'n_articles': authors['total'].astype(int).values
    }
    counts_per_type = np.zeros((len(authors), 3))
    for i, t in enumerate(['th', 'exp', 'ph']):
        counts_per_type[:, i] = authors[t].astype(int).values
    data['counts_per_type'] = counts_per_type.astype(int)
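    # Hierarchical beta-binomial model: each author i has a latent probability probs[i,j]
    # of publishing in category j, drawn from a Beta(alphas[j], betas[j]) population
    # distribution; the observed per-category counts are binomial.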
    stan_code = """
data {
    int n_authors;
    int n_types;
    int n_articles[n_authors];
    int counts_per_type[n_authors,n_types];
}
parameters {
    matrix<lower=0, upper=1>[n_authors, n_types] probs;
    vector<lower=0>[n_types] alphas;
    vector<lower=0>[n_types] betas;
    // vector<lower=0,upper=1>[n_types] mus;
    // vector<lower=0.1>[n_types] etas;
}
// transformed parameters {
//     vector<lower=0>[n_types] alphas;
//     vector<lower=0>[n_types] betas;
//
//     for (i in 1:n_types) {
//         alphas[i] = mus[i] * etas[i];
//         betas[i] = (1-mus[i]) * etas[i];
//     }
// }
model {
    for (i in 1:n_authors) {
        for (j in 1:n_types) {
            probs[i,j] ~ beta(alphas[j], betas[j]);
            counts_per_type[i,j] ~ binomial(n_articles[i], probs[i,j]);
        }
    }
    for (i in 1:n_types) {
        alphas[i] ~ exponential(1);
        betas[i] ~ exponential(1);
    }
}
generated quantities {
    // Posterior-predictive fraction of each author's articles per category.
    // Assumed reconstruction: the plotting branch below expects p_hat.{i}.{j}
    // columns in the sample dataframe, but no generated quantities block was
    // present in the original listing.
    matrix[n_authors, n_types] p_hat;
    for (i in 1:n_authors) {
        for (j in 1:n_types) {
            p_hat[i,j] = binomial_rng(n_articles[i], probs[i,j]) * 1.0 / n_articles[i];
        }
    }
}
"""
    posterior = stan.build(stan_code, data=data)
    fit = posterior.sample(num_chains=4, num_samples=1000)
    df = fit.to_frame()
    df.to_parquet('output/social_divide_samples.parquet')
else:
    df = pd.read_parquet('output/social_divide_samples.parquet')
    data = {'n_authors': 2500, 'n_types': 3}
    n = len(df)
    cats = ['th', 'exp', 'ph']

    probs = dict()
    alphas = dict()
    betas = dict()
    for t in np.arange(data['n_types']):
        probs[t] = [df[f'probs.{k}.{t+1}'].values for k in 1+np.arange(data['n_authors'])]
        probs[t] = np.array(probs[t])
        alphas[t] = df[f'alphas.{t+1}'].values
        betas[t] = df[f'betas.{t+1}'].values
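    # Posterior-predictive check: for every draw, histogram the predicted per-author
    # fractions p_hat and compare their average density with the observed fractions.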
    bins = np.linspace(0, 1, 20, True)
    hats = np.zeros((data['n_types'], n, len(bins)-1))
    for line in np.arange(n):
        for j in np.arange(data['n_types']):
            hats[j, line, :] = np.histogram(
                np.array([df[f'p_hat.{i+1}.{j+1}'].iloc[line] for i in np.arange(data['n_authors'])]),
                bins=bins
            )[0]
    fig, axes = plt.subplots(nrows=1, ncols=3, sharey=True)
    authors = pd.read_parquet('output/social_divide_authors.parquet')
    x = np.linspace(0, 1, 100, True)

    for i, t in enumerate([1, 2, 0]):
        ax = axes[i]
        # ax.hist(
        #     probs[t].flatten(),
        #     bins=np.linspace(0,1,20,True),
        #     histtype="step",
        #     color='black',
        #     density=True,
        #     lw=1,
        #     label="$p_{ij}$ (model)"
        # )
        # Posterior-predictive mean density of per-author fractions, with a 95% interval.
        m = np.mean(hats[t, :, :]/data['n_authors']/(bins[1]-bins[0]), axis=0)
        ax.scatter(
            (bins[:-1]+bins[1:])/2,
            m,
            color="black",
            s=1,
            label=r"$\hat{p}_{ij}$ (model)"
        )
        lower = m - np.quantile(hats[t, :, :]/data['n_authors']/(bins[1]-bins[0]), axis=0, q=0.05/2)
        upper = np.quantile(hats[t, :, :]/data['n_authors']/(bins[1]-bins[0]), axis=0, q=1-0.05/2) - m
        ax.errorbar(
            (bins[:-1]+bins[1:])/2,
            m,
            (lower, upper),
            color="black",
            lw=1,
            ls="None"
        )
        # Observed density of per-author fractions, for comparison.
        ax.hist(
            authors[f"{cats[t]}_frac"],
            bins=np.linspace(0, 1, 20, True),
            histtype="step",
            color='blue',
            density=True,
            lw=1,
            linestyle='-',
            label=r"$\hat{p}_{ij}$ (data)"
        )
        ax.set_yscale('log')
        # for i in np.arange(100):
        #     ax.plot(x, beta.pdf(x, alphas[t][-i], betas[t][-i]), alpha=0.01, color='blue')
        ax.set_xlim(0, 1)
        cat = cats[t]
        ax.set_xlabel(f'$p_{{i,\\mathrm{{{cat}}}}}$')

    plt.legend()
    actors_desc = "physicists" if args.target == "authors" else "institutions"
    fig.suptitle("Share of publications by "+actors_desc+" belonging to the categories\n``Experiment'' ($p_{exp}$), ``Phenomenology'' ($p_{ph}$), ``Theory'' ($p_{th}$)")
    plt.savefig(f"plots/social_divide_gof_{args.target}.pgf")
    plt.savefig(f"plots/social_divide_gof_{args.target}.pdf")