# terms.py

import nltk
import re
import numpy as np
import multiprocessing as mp
from functools import partial
from typing import List, Optional, Union


class TermExtractor:
    """Extract candidate terms from a collection of abstracts using POS-tag patterns."""

    # Each pattern is a sequence of Penn Treebank POS-tag regexes;
    # e.g. ["JJ.*", "NN.*"] matches an adjective followed by a noun.
    DEFAULT_PATTERNS = [
        ["JJ.*", "NN.*"],
        ["JJ.*", "NN.*", "NN.*"],
        ["JJ.*", "NN", "CC", "NN.*"],
        ["JJ.*", "NN.*", "JJ.*", "NN.*"],
        # ["RB.*", "JJ.*", "NN.*", "NN.*"],
        # ["JJ.*", "NN.*", "IN", "PRP", "NN.*"],
        # ["JJ.*", "NN.*", "IN", "DT", "NN.*"],
        # ["JJ.*", "VBN", "VBG", "NN.*"],
    ]

    def __init__(self, abstracts: List[str], patterns: Optional[List[List[str]]] = None, limit_redundancy: bool = False):
        self.abstracts = [abstract.lower() for abstract in abstracts]
        self.patterns = self.DEFAULT_PATTERNS if patterns is None else patterns
        self.limit_redundancy = bool(limit_redundancy)

    def add_patterns(self, patterns: List[List[str]]):
        self.patterns += patterns

    def tokens(self, split_sentences: bool = False, threads: int = 0) -> Union[List[List[str]], List[List[List[str]]]]:
        """Tokenize every abstract, optionally keeping sentence boundaries."""
        func = partial(self.tokens_from_text, split_sentences)
        if threads == 1:
            return list(map(func, self.abstracts))
        # threads <= 0 means "use every available core".
        with mp.Pool(processes=mp.cpu_count() if threads <= 0 else threads) as pool:
            return pool.map(func, self.abstracts)

    def tokens_from_text(self, split_sentences: bool, text: str) -> Union[List[str], List[List[str]]]:
        """Tokenize a single abstract, dropping English stop words."""
        stop_words = set(nltk.corpus.stopwords.words("english"))
        if split_sentences:
            tokens = []
            for sentence in nltk.sent_tokenize(text):
                _tokens = nltk.word_tokenize(sentence)
                tokens.append([token for token in _tokens if token not in stop_words])
        else:
            tokens = nltk.word_tokenize(text)
            tokens = [token for token in tokens if token not in stop_words]
        return tokens

    def ngrams(self, lemmatize: bool = False, lemmatize_ngrams: bool = False, threads: int = 0) -> List[List[List[str]]]:
        """Extract matching n-grams from every abstract, optionally lemmatized."""
        # Longest patterns first, so limit_redundancy keeps the longest match.
        self.patterns = sorted(self.patterns, key=len, reverse=True)
        if threads == 1:
            _ngrams = list(map(self.ngrams_from_text, self.abstracts))
        else:
            with mp.Pool(processes=mp.cpu_count() if threads <= 0 else threads) as pool:
                _ngrams = pool.map(self.ngrams_from_text, self.abstracts)
        if lemmatize:
            lemmatizer = nltk.stem.WordNetLemmatizer()
            if lemmatize_ngrams:
                # Lemmatize every token of every n-gram.
                _ngrams = [
                    [list(map(lemmatizer.lemmatize, ngram)) for ngram in abstract_ngrams]
                    for abstract_ngrams in _ngrams
                ]
            else:
                # Only lemmatize single-token terms; longer n-grams are left as-is.
                _ngrams = [
                    [ngram if len(ngram) > 1 else [lemmatizer.lemmatize(ngram[0])] for ngram in abstract_ngrams]
                    for abstract_ngrams in _ngrams
                ]
        return _ngrams

    def ngrams_from_text(self, text: str) -> List[List[str]]:
        """POS-tag one abstract and collect token sequences matching any pattern."""
        matches = []
        for sentence in nltk.sent_tokenize(text):
            tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
            expressions_positions = []
            for i, _ in enumerate(tokens):
                for pattern in self.patterns:
                    length = len(pattern)
                    tags = [tag for _, tag in tokens[i : i + length]]
                    if len(tags) != length:
                        continue
                    if all(re.match(pat, tags[j]) for j, pat in enumerate(pattern)):
                        keep = True
                        if self.limit_redundancy:
                            # Drop matches fully contained in an already-kept expression.
                            for a, b in expressions_positions:
                                if i >= a and i + length <= b:
                                    keep = False
                                    break
                        if not keep:
                            continue
                        matches.append([token for token, _ in tokens[i : i + length]])
                        expressions_positions.append((i, i + length))
        return matches
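

# Minimal usage sketch (illustrative only; the sample abstracts below are made up).
# NLTK needs its tokenizer, stop-word, POS-tagger and WordNet resources installed,
# e.g. nltk.download("punkt"), nltk.download("stopwords"),
# nltk.download("averaged_perceptron_tagger"), nltk.download("wordnet").
if __name__ == "__main__":
    abstracts = [
        "Deep neural networks achieve strong empirical performance on large datasets.",
        "Statistical term extraction relies on syntactic patterns and frequency counts.",
    ]
    extractor = TermExtractor(abstracts, limit_redundancy=True)
    # threads=1 keeps the example single-process.
    print(extractor.ngrams(lemmatize=True, threads=1))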