@@ -60,7 +60,15 @@ class TermExtractor:
         return tokens
 
 
-    def ngrams(self, lemmatize: bool = False, lemmatize_ngrams: bool = False, threads: int = 0) -> List[List[List[str]]]:
+    def ngrams(
+        self,
+        lemmatize: bool = False,
+        lemmatize_ngrams: bool = False,
+        split_sentences: bool = False,
+        threads: int = 0
+    ) -> List[List[List[str]]]:
+
+        self.split_sentences = bool(split_sentences)
         self.patterns = sorted(self.patterns, key=len, reverse=True)
 
         _ngrams = None
@@ -75,21 +83,45 @@ class TermExtractor:
 
         lemmatizer = nltk.stem.WordNetLemmatizer()
 
         if lemmatize_ngrams:
-            _ngrams = [
-                [
-                    list(map(lemmatizer.lemmatize, ngram))
-                    for ngram in abstract_ngrams
-                ]
-                for abstract_ngrams in _ngrams
-            ]
-        else:
-            _ngrams = [
-                [
-                    ngram if len(ngram) > 1 else [lemmatizer.lemmatize(ngram[0])]
-                    for ngram in abstract_ngrams
-                ]
-                for abstract_ngrams in _ngrams
-            ]
+            if self.split_sentences:
+                _ngrams = [
+                    [
+                        [
+                            list(map(lemmatizer.lemmatize, ngram))
+                            for ngram in sentence
+                        ]
+                        for sentence in abstract_ngrams
+                    ]
+                    for abstract_ngrams in _ngrams
+                ]
+            else:
+                _ngrams = [
+                    [
+                        list(map(lemmatizer.lemmatize, ngram))
+                        for ngram in abstract_ngrams
+                    ]
+                    for abstract_ngrams in _ngrams
+                ]
+        else:
+            if self.split_sentences:
+                _ngrams = [
+                    [
+                        [
+                            ngram if len(ngram) > 1 else [lemmatizer.lemmatize(ngram[0])]
+                            for ngram in sentence
+                        ]
+                        for sentence in abstract_ngrams
+                    ]
+                    for abstract_ngrams in _ngrams
+                ]
+            else:
+                _ngrams = [
+                    [
+                        ngram if len(ngram) > 1 else [lemmatizer.lemmatize(ngram[0])]
+                        for ngram in abstract_ngrams
+                    ]
+                    for abstract_ngrams in _ngrams
+                ]
 
         return _ngrams
@@ -98,6 +130,9 @@ class TermExtractor:
         sentences = nltk.sent_tokenize(text)
 
         for sentence in sentences:
+            if self.split_sentences:
+                sent_matches = []
+
             tokens = nltk.word_tokenize(sentence)
             tokens = nltk.pos_tag(tokens)
 
@@ -125,10 +160,20 @@ class TermExtractor:
                 if keep == False:
                     continue
 
-                matches.append(
-                    list(map(lambda x: x[0], tokens[i : i + length]))
-                )
+                if self.split_sentences:
+                    sent_matches.append(
+                        list(map(lambda x: x[0], tokens[i : i + length]))
+                    )
+                else:
+                    matches.append(
+                        list(map(lambda x: x[0], tokens[i : i + length]))
+                    )
+
                 expressions_positions.append(
                     (i, i+length)
                 )
+
+            if self.split_sentences:
+                matches.append(sent_matches)
+
         return matches
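
For reviewers: a minimal standalone sketch of how split_sentences changes the
shape of the lemmatized n-grams. This is not part of the patch; the toy data
and the doc/sentence names are illustrative, and the only dependency assumed
is NLTK's WordNetLemmatizer, which the diff already uses.

    # Sketch only: mirrors the comprehensions in the patch on toy data.
    import nltk

    nltk.download("wordnet", quiet=True)  # one-time corpus download for the lemmatizer
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # split_sentences=False: documents -> n-grams -> tokens (old shape)
    flat = [[["neural", "networks"], ["hidden", "layers"]]]
    print([
        [list(map(lemmatizer.lemmatize, ngram)) for ngram in doc]
        for doc in flat
    ])
    # [[['neural', 'network'], ['hidden', 'layer']]]

    # split_sentences=True: documents -> sentences -> n-grams -> tokens (new shape)
    split = [[[["neural", "networks"]], [["hidden", "layers"]]]]
    print([
        [
            [list(map(lemmatizer.lemmatize, ngram)) for ngram in sentence]
            for sentence in doc
        ]
        for doc in split
    ])
    # [[[['neural', 'network']], [['hidden', 'layer']]]]

One note on the signature: with split_sentences=True the return value gains a
nesting level, so the declared List[List[List[str]]] return type describes only
the default shape.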