terms.py

import nltk
import re
import multiprocessing as mp
from functools import partial
from typing import List, Optional, Union
class TermExtractor:
    # Sequences of POS-tag regexes; a candidate term is any run of tokens whose
    # tags match one of these patterns (e.g. adjective(s) followed by noun(s)).
    DEFAULT_PATTERNS = [
        ["JJ.*", "NN.*"],
        ["JJ.*", "NN.*", "NN.*"],
        ["JJ.*", "NN", "CC", "NN.*"],
        ["JJ.*", "NN.*", "JJ.*", "NN.*"],
        # ["RB.*", "JJ.*", "NN.*", "NN.*"],
        # ["JJ.*", "NN.*", "IN", "PRP", "NN.*"],
        # ["JJ.*", "NN.*", "IN", "DT", "NN.*"],
        # ["JJ.*", "VBN", "VBG", "NN.*"],
    ]

    def __init__(self, abstracts: List[str], patterns: Optional[List[List[str]]] = None, limit_redundancy: bool = False):
        self.abstracts = [abstract.lower() for abstract in abstracts]
        self.patterns = self.DEFAULT_PATTERNS if patterns is None else patterns
        self.limit_redundancy = bool(limit_redundancy)

    def add_patterns(self, patterns: List[List[str]]):
        self.patterns += patterns
    def tokens(self, lemmatize: bool = False, split_sentences: bool = False, threads: int = 0) -> Union[List[List[str]], List[List[List[str]]]]:
        # threads == 1 runs serially; threads <= 0 uses every available core.
        if threads == 1:
            tokns = list(map(partial(self.tokens_from_text, split_sentences), self.abstracts))
        else:
            with mp.Pool(processes=mp.cpu_count() if threads <= 0 else threads) as pool:
                tokns = pool.map(partial(self.tokens_from_text, split_sentences), self.abstracts)
        if lemmatize:
            lemmatizer = nltk.stem.WordNetLemmatizer()
            for i, doc in enumerate(tokns):
                if split_sentences:
                    tokns[i] = [list(map(lemmatizer.lemmatize, sentence)) for sentence in doc]
                else:
                    tokns[i] = list(map(lemmatizer.lemmatize, doc))
        return tokns
    def tokens_from_text(self, split_sentences: bool, text: str) -> Union[List[str], List[List[str]]]:
        stop_words = nltk.corpus.stopwords.words("english")
        if split_sentences:
            tokens = []
            for sentence in nltk.sent_tokenize(text):
                _tokens = nltk.word_tokenize(sentence)
                tokens.append([token for token in _tokens if token not in stop_words])
        else:
            tokens = nltk.word_tokenize(text)
            tokens = [token for token in tokens if token not in stop_words]
        return tokens
    def ngrams(self, lemmatize: bool = False, lemmatize_ngrams: bool = False, threads: int = 0) -> List[List[List[str]]]:
        # Try longer patterns first so that, with limit_redundancy, a short
        # match nested inside a longer one is discarded.
        self.patterns = sorted(self.patterns, key=len, reverse=True)
        if threads == 1:
            _ngrams = list(map(self.ngrams_from_text, self.abstracts))
        else:
            with mp.Pool(processes=mp.cpu_count() if threads <= 0 else threads) as pool:
                _ngrams = pool.map(self.ngrams_from_text, self.abstracts)
        if lemmatize:
            lemmatizer = nltk.stem.WordNetLemmatizer()
            if lemmatize_ngrams:
                # Lemmatize every token of every n-gram.
                _ngrams = [
                    [list(map(lemmatizer.lemmatize, ngram)) for ngram in abstract_ngrams]
                    for abstract_ngrams in _ngrams
                ]
            else:
                # Only lemmatize single-token n-grams; leave longer ones intact.
                _ngrams = [
                    [
                        ngram if len(ngram) > 1 else [lemmatizer.lemmatize(ngram[0])]
                        for ngram in abstract_ngrams
                    ]
                    for abstract_ngrams in _ngrams
                ]
        return _ngrams
    def ngrams_from_text(self, text: str) -> List[List[str]]:
        matches = []
        for sentence in nltk.sent_tokenize(text):
            tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
            expressions_positions = []
            for i in range(len(tokens)):
                for pattern in self.patterns:
                    length = len(pattern)
                    tags = [t[1] for t in tokens[i : i + length]]
                    if len(tags) != length:
                        continue
                    if all(re.match(pat, tags[j]) for j, pat in enumerate(pattern)):
                        # With limit_redundancy, skip a match that is fully
                        # contained in a longer match found earlier (patterns
                        # are sorted longest-first in ngrams()).
                        keep = True
                        if self.limit_redundancy:
                            for a, b in expressions_positions:
                                if i >= a and i + length <= b:
                                    keep = False
                                    break
                        if not keep:
                            continue
                        matches.append([t[0] for t in tokens[i : i + length]])
                        expressions_positions.append((i, i + length))
        return matches
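

# A minimal usage sketch, not part of the original module: the two abstracts
# below are made-up examples, and running this assumes the NLTK resources
# needed by the class (e.g. "punkt", "stopwords", "averaged_perceptron_tagger",
# "wordnet") have already been downloaded.
if __name__ == "__main__":
    abstracts = [
        "Deep neural networks achieve strong results on statistical machine translation.",
        "We present a novel graph algorithm for sparse matrix factorization.",
    ]
    extractor = TermExtractor(abstracts, limit_redundancy=True)
    # threads=1 keeps the demo serial; pass threads=0 to use all cores.
    for doc_ngrams in extractor.ngrams(lemmatize=True, threads=1):
        print([" ".join(ngram) for ngram in doc_ngrams])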