rtd_analytics 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157
  1. #!/usr/bin/env python
  2. """Query DataLad Handbook analytics
  3. Usage: rtd_analytics <project-name>
  4. Needs a readthedocs account with adequate permissions. Set user/password
  5. via ENV variables RTD_USER and RTD_PASSWORD
  6. Stats are printed to STDOUT
  7. """
  8. import csv
  9. import io
  10. from os import environ
  11. import sys
  12. import mechanize
  13. class ReadTheDocs:
  14. traffic_url = \
  15. "https://readthedocs.org/dashboard/{project}/traffic-analytics/"
  16. search_url = \
  17. "https://readthedocs.org/dashboard/{project}/search-analytics/"
  18. def __init__(self, project, username, password):
  19. self._project = project
  20. self._sadata = None
  21. self._tadata = None
  22. br = mechanize.Browser()
  23. br.set_handle_robots(False)
  24. br.set_handle_redirect(mechanize.HTTPRedirectHandler)
  25. br.addheaders = [
  26. ('User-agent',
  27. 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
  28. 'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')
  29. ]
  30. br.open("https://readthedocs.org")
  31. # follow second link with element text matching regular expression
  32. br._factory.is_html = True
  33. base_url = br.geturl()
  34. # login
  35. br.open(base_url + "/accounts/login/")
  36. # find first form
  37. br.select_form(nr=0)
  38. br.form['login'] = username
  39. br.form['password'] = password
  40. br.submit()
  41. # keep browser running
  42. self.br = br
  43. def get_analytics_data(self, url):
  44. self.br.open(url)
  45. # the download is done via a form button. find the form
  46. self.br.select_form(nr=0)
  47. # press the button
  48. r = self.br.submit()
  49. # download CSV data as text
  50. return r.get_data().decode('utf-8')
  51. @property
  52. def search_analytics(self):
  53. if self._sadata is None:
  54. self._sadata = self.get_analytics_data(
  55. ReadTheDocs.search_url.format(project=self._project)
  56. )
  57. return self._sadata
  58. @property
  59. def traffic_analytics(self):
  60. if self._tadata is None:
  61. self._tadata = self.get_analytics_data(
  62. ReadTheDocs.traffic_url.format(project=self._project)
  63. )
  64. return self._tadata
  65. def get_aggregate_searches(rtd):
  66. """Yield a dict with queries as keys and (events, generated hits) as values
  67. """
  68. stats = dict()
  69. for d in csv.DictReader(io.StringIO(rtd.search_analytics)):
  70. queries, hits = stats.get(d['Query'], [0, 0])
  71. queries += 1
  72. hits = max(hits, int(d['Total Results']))
  73. stats[d['Query']] = (queries, hits)
  74. return stats
  75. def get_aggregate_traffic(rtd):
  76. """Yield a dict with (version, path) as keys and views as values
  77. """
  78. stats = dict()
  79. for d in csv.DictReader(io.StringIO(rtd.traffic_analytics)):
  80. try:
  81. # integer versions refer to PR-builds
  82. pr_id = int(d['Version'])
  83. d['Version'] = f'PR-{pr_id}'
  84. except ValueError:
  85. pass
  86. key = (d['Version'], d["Path"])
  87. views = stats.get(key, 0)
  88. views += int(d['Views'])
  89. stats[key] = views
  90. return stats
  91. def main(project, user=None, password=None):
  92. rtd = ReadTheDocs(
  93. project,
  94. environ.get('RTD_USER', user),
  95. environ.get('RTD_PASSWORD', password),
  96. )
  97. aggsearch = get_aggregate_searches(rtd)
  98. aggtraffic = get_aggregate_traffic(rtd)
  99. print(f'#### RTD Analytics: {project} (last 30 days)')
  100. paths = {}
  101. for k, v in aggtraffic.items():
  102. views = paths.get(k[1], 0)
  103. views += v
  104. paths[k[1]] = views
  105. versions = {}
  106. for k, v in aggtraffic.items():
  107. views = versions.get(k[0], 0)
  108. views += v
  109. versions[k[0]] = views
  110. print(f'##### 15 most popular sections ({sum(versions.values())} total page views)')
  111. print(', '.join(
  112. f'{k}({v})'.replace('.html', '').strip('/')
  113. for k, v in sorted(
  114. ((k, v) for k, v in paths.items()
  115. if k not in (
  116. '/search.html', '/r.html')),
  117. key=lambda x: x[1],
  118. reverse=True)[:15]))
  119. print(f'##### Version popularity')
  120. print(', '.join(f'{k}({v})' for k, v in sorted(versions.items(), reverse=True)))
  121. print(f'##### 10 Most popular search terms ({len(aggsearch)} unique queries)')
  122. print(', '.join(
  123. list(f'{q}({h[0]})'
  124. for q, h in sorted(((k, v) for k, v in aggsearch.items()),
  125. key=lambda x: x[1], reverse=True)
  126. )[:10]))
  127. print('##### Repeated searches with no hits')
  128. print(', '.join(
  129. f'{q}({h[0]})' for q, h in sorted(
  130. ((k, v) for k, v in aggsearch.items()
  131. if v[0] > 1 and v[1] == 0),
  132. key=lambda x: x[1],
  133. reverse=True)
  134. ))
  135. if __name__ == '__main__':
  136. main(sys.argv[1])