
term extractor

Lucas Gautheron 7 months ago
commit 073526590f
2 changed files with 73 additions and 34 deletions
  1. AbstractSemantics/embeddings.py (+12 -18)
  2. AbstractSemantics/terms.py (+61 -16)

+ 12 - 18
AbstractSemantics/embeddings.py

@@ -15,29 +15,29 @@ class Embeddings(ABC):
         self.tokens = tokens
 
     @abstractmethod
-    def model(self):
+    def create_model(self):
         pass
 
     @abstractmethod
-    def recover(self):
+    def load_model(self):
         pass
 
 
 class GensimWord2Vec(Embeddings):
-    def __init__(self, tokens, **kwargs):
+    def __init__(self, tokens=[[]], **kwargs):
         super().__init__(tokens)
 
-    def model(
+    def create_model(
         self,
-        vector_size: int = 128,
-        window: int = 20,
+        vector_size: int = 100,
+        window: int = 5,
         min_count: int = 10,
         workers: int = 4,
         **kwargs
     ):
         from gensim.models import word2vec
 
-        model = word2vec.Word2Vec(
+        self.model = word2vec.Word2Vec(
             self.tokens,
             vector_size=vector_size,
             window=window,
@@ -45,16 +45,10 @@ class GensimWord2Vec(Embeddings):
             workers=workers,
             **kwargs
         )
-        return model
+        return self.model
 
-    def recover(self, model):
-        return model
-        # tokens = self.get_tokens(threads=threads)
-        # tokens = set(itertools.chain.from_iterable(tokens))
-
-        # embeddings = []
-
-        # for text in tokens:
-        #     embeddings.append([model.wv[token] for token in text if token in model.wv])
+    def load_model(self, path):
+        from gensim.models import word2vec
 
-        # return embeddings
+        self.model = word2vec.Word2Vec.load(path)
+        return self.model

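The renamed methods make the model lifecycle explicit: create_model() trains a Word2Vec model and keeps it on the instance, while load_model() restores a previously saved one from disk. A minimal usage sketch (the toy corpus and file name are made up; save() is gensim's own persistence method, not part of this wrapper):

    from AbstractSemantics.embeddings import GensimWord2Vec

    # Hypothetical corpus: one list of tokens per abstract.
    tokens = [
        ["dark", "matter", "halo", "dark", "matter"],
        ["neutrino", "oscillation", "dark", "matter"],
    ]

    emb = GensimWord2Vec(tokens)
    # min_count=1 only because the toy corpus is tiny; the wrapper's default is 10.
    model = emb.create_model(vector_size=32, window=5, min_count=1, workers=1)
    model.save("w2v.model")  # gensim's built-in persistence

    # Elsewhere: reload without retraining. The new tokens=[[]] default
    # makes the corpus argument optional for this use case.
    model = GensimWord2Vec().load_model("w2v.model")
    print(model.wv.most_similar("matter", topn=2))

The tokens=[[]] default presumably exists so the wrapper can be instantiated just to load a saved model; since mutable defaults are shared across calls, passing the corpus explicitly remains the safer pattern when training.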
+ 61 - 16
AbstractSemantics/terms.py

@@ -60,7 +60,15 @@ class TermExtractor:
 
         return tokens
 
-    def ngrams(self, lemmatize: bool = False, lemmatize_ngrams: bool = False, threads: int = 0) -> List[List[List[str]]]:
+    def ngrams(
+        self,
+        lemmatize: bool = False,
+        lemmatize_ngrams: bool = False,
+        split_sentences: bool = False,
+        threads: int = 0
+    ) -> List[List[List[str]]]:
+
+        self.split_sentences = bool(split_sentences)
         self.patterns = sorted(self.patterns, key=len, reverse=True)
 
         _ngrams = None
@@ -75,21 +83,45 @@ class TermExtractor:
             lemmatizer = nltk.stem.WordNetLemmatizer()
 
             if lemmatize_ngrams:
+                    if self.split_sentences:
+                        _ngrams = [
+                            [
+                                [
+                                    list(map(lemmatizer.lemmatize, ngram))
+                                    for ngram in sentence
+                                ]
+                                for sentence in abstract_ngrams
+                            ]
+                            for abstract_ngrams in _ngrams
+                        ]
+                    else:
+                        _ngrams = [
+                            [
+                                list(map(lemmatizer.lemmatize, ngram))
+                                for ngram in abstract_ngrams
+                            ]
+                            for abstract_ngrams in _ngrams
+                        ]
+            else:
+                if self.split_sentences:
                     _ngrams = [
-                    [
-                        list(map(lemmatizer.lemmatize, ngram))
-                        for ngram in abstract_ngrams
+                        [
+                            [
+                                ngram if len(ngram) > 1 else [lemmatizer.lemmatize(ngram[0])]
+                                for ngram in sentence
+                            ]
+                            for sentence in abstract_ngrams
+                        ]
+                        for abstract_ngrams in _ngrams
                     ]
-                    for abstract_ngrams in _ngrams
-                ]
-            else:
-                _ngrams = [
-                    [
-                        ngram if len(ngram) > 1 else [lemmatizer.lemmatize(ngram[0])]
-                        for ngram in abstract_ngrams
+                else:
+                    _ngrams = [
+                        [
+                            ngram if len(ngram) > 1 else [lemmatizer.lemmatize(ngram[0])]
+                            for ngram in abstract_ngrams
+                        ]
+                        for abstract_ngrams in _ngrams
                     ]
-                    for abstract_ngrams in _ngrams
-                ]
 
         return _ngrams
 
@@ -98,6 +130,9 @@ class TermExtractor:
         sentences = nltk.sent_tokenize(text)
 
         for sentence in sentences:
+            if self.split_sentences:
+                sent_matches = []
+
             tokens = nltk.word_tokenize(sentence)
             tokens = nltk.pos_tag(tokens)
             
@@ -125,10 +160,20 @@ class TermExtractor:
                         if keep == False:
                             continue
                         
-                        matches.append(
-                            list(map(lambda x: x[0], tokens[i : i + length]))
-                        )
+                        if self.split_sentences:
+                            sent_matches.append(
+                                list(map(lambda x: x[0], tokens[i : i + length]))
+                            )
+                        else:
+                            matches.append(
+                                list(map(lambda x: x[0], tokens[i : i + length]))
+                            )
+
                         expressions_positions.append(
                             (i, i+length)
                         )
+
+            if self.split_sentences:
+                matches.append(sent_matches)
+            
         return matches
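The new split_sentences flag changes the shape of the result: instead of one flat list of n-grams per abstract, ngrams() returns one list per sentence within each abstract, which the matching loop supports by collecting sent_matches sentence by sentence. (The matcher reads self.split_sentences, which is set in ngrams(), so it is meant to be driven through that entry point.) A sketch of both call styles; the TermExtractor(abstracts) constructor call and the sample text are assumptions, as the constructor is not shown in this diff, and nltk's punkt, averaged_perceptron_tagger and wordnet resources must be installed:

    from AbstractSemantics.terms import TermExtractor

    abstracts = [
        "Dark matter halos shape galaxy formation. Neutrino masses remain unknown.",
    ]

    # Assumed constructor: the diff only shows ngrams() and the matcher.
    extractor = TermExtractor(abstracts)

    # Previous behaviour: one flat list of n-grams per abstract.
    flat = extractor.ngrams(lemmatize=True)
    # flat[0] -> all n-grams matched anywhere in the first abstract

    # New in this commit: keep sentence boundaries.
    nested = extractor.ngrams(lemmatize=True, split_sentences=True)
    # nested[0][0] -> n-grams matched in the first sentence of the first abstract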