social_divide.py

import nest_asyncio
nest_asyncio.apply()
del nest_asyncio

import stan
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import beta
import argparse

matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "xelatex",
        "font.family": "serif",
        "font.serif": "Times New Roman",
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)
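
# This script models how HEP authors split their publications between the
# Theory-HEP, Experiment-HEP and Phenomenology-HEP categories with a
# hierarchical beta-binomial model (see the Stan code below), then compares
# posterior-predictive per-author fractions with the observed ones.
# Run `python social_divide.py compute` to fit the model, then
# `python social_divide.py plot` to produce the goodness-of-fit figure.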


def is_susy(abstract: str) -> bool:
    # Crude keyword check for supersymmetry-related abstracts.
    abstract = abstract.lower()
    return (
        "supersymmetry" in abstract
        or "supersymmetric" in abstract
        or "susy" in abstract
    )


def is_hep(categories) -> bool:
    # `categories` is an iterable of INSPIRE category strings such as "Theory-HEP".
    return any("-HEP" in x for x in categories)


parser = argparse.ArgumentParser()
parser.add_argument('action', choices=["compute", "plot"])
# `target` is used by the plotting branch (figure labels and output file names);
# the default of "authors" is assumed here, matching the data actually loaded.
parser.add_argument('--target', choices=["authors", "institutions"], default="authors")
args = parser.parse_args()

if args.action == "compute":
    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["article_id", "categories", "date_created"]]
    articles["is_hep"] = articles.categories.map(is_hep)
    articles["is_th"] = articles.categories.map(lambda l: "Theory-HEP" in l)
    articles["is_exp"] = articles.categories.map(lambda l: "Experiment-HEP" in l)
    articles["is_pheno"] = articles.categories.map(lambda l: "Phenomenology-HEP" in l)
    # articles["is_susy"] = articles["abstract"].map(is_susy)
    articles["year"] = articles["date_created"].str[:4].replace('', 0).astype(float).fillna(0).astype(int)
    articles = articles[articles["is_hep"]]
    articles = articles[(articles["year"] >= 1980) & (articles["year"] < 2020)]

    authors = pd.read_parquet("projects/inspire-harvest/database/articles_authors.parquet")[["article_id", "bai"]]
    authors = authors.merge(articles[["article_id", "is_th", "is_exp", "is_pheno"]], how='right')
    authors = authors.groupby("bai").agg(
        th=('is_th', 'sum'),
        exp=('is_exp', 'sum'),
        ph=('is_pheno', 'sum'),
        total=('article_id', 'count'),
        th_frac=('is_th', 'mean'),
        exp_frac=('is_exp', 'mean'),
        ph_frac=('is_pheno', 'mean'),
    )
    authors = authors[authors['total'] >= 3]
    authors = authors.sample(frac=1).head(2500)
    authors.to_parquet('output/social_divide_authors.parquet')

    data = {
        'n_authors': len(authors),
        'n_types': 3,
        'n_articles': authors['total'].astype(int).values,
    }
    counts_per_type = np.zeros((len(authors), 3))
    for i, t in enumerate(['th', 'exp', 'ph']):
        counts_per_type[:, i] = authors[t].astype(int).values
    data['counts_per_type'] = counts_per_type.astype(int)
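
    # Hierarchical beta-binomial model: each author i has a latent probability probs[i,j]
    # of publishing an article of type j (theory, experiment, phenomenology), drawn from a
    # type-level Beta(alphas[j], betas[j]) population prior; the observed counts per type
    # are binomial given the author's total number of articles.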
    stan_code = """
    data {
        int n_authors;
        int n_types;
        int n_articles[n_authors];
        int counts_per_type[n_authors, n_types];
    }
    parameters {
        matrix<lower=0, upper=1>[n_authors, n_types] probs;
        vector<lower=0>[n_types] alphas;
        vector<lower=0>[n_types] betas;
        // vector<lower=0,upper=1>[n_types] mus;
        // vector<lower=0.1>[n_types] etas;
    }
    // transformed parameters {
    //     vector<lower=0>[n_types] alphas;
    //     vector<lower=0>[n_types] betas;
    //
    //     for (i in 1:n_types) {
    //         alphas[i] = mus[i] * etas[i];
    //         betas[i] = (1-mus[i]) * etas[i];
    //     }
    // }
    model {
        for (i in 1:n_authors) {
            for (j in 1:n_types) {
                probs[i,j] ~ beta(alphas[j], betas[j]);
                counts_per_type[i,j] ~ binomial(n_articles[i], probs[i,j]);
            }
        }
        for (i in 1:n_types) {
            alphas[i] ~ exponential(1);
            betas[i] ~ exponential(1);
        }
    }
    // Assumed reconstruction: posterior-predictive fraction of type-j articles for
    // author i, read back by the plotting branch as the p_hat.<author>.<type> columns.
    generated quantities {
        matrix<lower=0, upper=1>[n_authors, n_types] p_hat;
        for (i in 1:n_authors) {
            for (j in 1:n_types) {
                p_hat[i,j] = binomial_rng(n_articles[i], probs[i,j]) * 1.0 / n_articles[i];
            }
        }
    }
    """
    posterior = stan.build(stan_code, data=data)
    fit = posterior.sample(num_chains=4, num_samples=1000)
    df = fit.to_frame()
    df.to_parquet('output/social_divide_samples.parquet')
else:
    df = pd.read_parquet('output/social_divide_samples.parquet')
    data = {'n_authors': 2500, 'n_types': 3}
    n = len(df)
    cats = ['th', 'exp', 'ph']

    probs = dict()
    alphas = dict()
    betas = dict()
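    # fit.to_frame() flattens each Stan parameter into 1-based, dot-separated columns,
    # e.g. "probs.12.2" holds the posterior draws of probs[12,2].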
    for t in np.arange(data['n_types']):
        probs[t] = [df[f'probs.{k}.{t+1}'].values for k in 1 + np.arange(data['n_authors'])]
        probs[t] = np.array(probs[t])
        alphas[t] = df[f'alphas.{t+1}'].values
        betas[t] = df[f'betas.{t+1}'].values

    bins = np.linspace(0, 1, 20, True)
    hats = np.zeros((data['n_types'], n, len(bins) - 1))
    for line in np.arange(n):
        for j in np.arange(data['n_types']):
            hats[j, line, :] = np.histogram(
                np.array([df[f'p_hat.{i+1}.{j+1}'].iloc[line] for i in np.arange(data['n_authors'])]),
                bins=bins
            )[0]
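    # hats[j, draw, :] counts how many authors' posterior-predictive fractions p_hat fall
    # into each bin for that posterior draw; dividing by n_authors and the bin width below
    # turns the counts into densities comparable with the observed per-author histograms.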

    fig, axes = plt.subplots(nrows=1, ncols=3, sharey=True)
    authors = pd.read_parquet('output/social_divide_authors.parquet')
    x = np.linspace(0, 1, 100, True)

    for i, t in enumerate([1, 2, 0]):
        ax = axes[i]

        # ax.hist(
        #     probs[t].flatten(),
        #     bins=np.linspace(0, 1, 20, True),
        #     histtype="step",
        #     color='black',
        #     density=True,
        #     lw=1,
        #     label="$p_{ij}$ (model)"
        # )

        m = np.mean(hats[t, :, :] / data['n_authors'] / (bins[1] - bins[0]), axis=0)
        ax.scatter(
            (bins[:-1] + bins[1:]) / 2,
            m,
            color="black",
            s=1,
            label=r"$\hat{p}_{ij}$ (model)"
        )
        lower = m - np.quantile(hats[t, :, :] / data['n_authors'] / (bins[1] - bins[0]), axis=0, q=0.05 / 2)
        upper = np.quantile(hats[t, :, :] / data['n_authors'] / (bins[1] - bins[0]), axis=0, q=1 - 0.05 / 2) - m
        ax.errorbar(
            (bins[:-1] + bins[1:]) / 2,
            m,
            (lower, upper),
            color="black",
            lw=1,
            ls="None"
        )
        ax.hist(
            authors[f"{cats[t]}_frac"],
            bins=np.linspace(0, 1, 20, True),
            histtype="step",
            color='blue',
            density=True,
            lw=1,
            linestyle='-',
            label=r"$\hat{p}_{ij}$ (data)"
        )
        ax.set_yscale('log')
        # for i in np.arange(100):
        #     ax.plot(x, beta.pdf(x, alphas[t][-i], betas[t][-i]), alpha=0.01, color='blue')
        ax.set_xlim(0, 1)
        cat = cats[t]
        ax.set_xlabel(f'$p_{{i,\\mathrm{{{cat}}}}}$')

    plt.legend()
    actors_desc = "physicists" if args.target == "authors" else "institutions"
    fig.suptitle(
        "Share of publications by " + actors_desc + " belonging to the categories\n"
        "``Experiment'' ($p_{exp}$), ``Phenomenology'' ($p_{ph}$), ``Theory'' ($p_{th}$)"
    )
    plt.savefig(f"plots/social_divide_gof_{args.target}.pgf")
    plt.savefig(f"plots/social_divide_gof_{args.target}.pdf")