```python
from abc import ABC, abstractmethod
from typing import List


class Embeddings(ABC):
    """Base class for embedding backends that operate on pre-tokenized texts."""

    def __init__(self, tokens: List[List[str]]):
        # Each inner list holds the tokens of one document.
        self.tokens = tokens

    @abstractmethod
    def train(self):
        pass

    @abstractmethod
    def recover(self, model):
        pass


class GensimWord2Vec(Embeddings):
    def __init__(self, tokens: List[List[str]], **kwargs):
        super().__init__(tokens)

    def train(
        self,
        vector_size: int = 128,
        window: int = 20,
        min_count: int = 10,
        threads: int = 4,
        **kwargs,
    ):
        # Imported lazily so gensim is only required when training.
        from gensim.models import word2vec

        model = word2vec.Word2Vec(
            self.tokens,
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            workers=threads,
            **kwargs,
        )
        return model

    def recover(self, model):
        # Map each document to the vectors of its in-vocabulary tokens;
        # tokens pruned by min_count during training are skipped.
        embeddings = []
        for text in self.tokens:
            embeddings.append(
                [model.wv[token] for token in text if token in model.wv]
            )
        return embeddings
```
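
As a quick check, a minimal usage sketch might look like the following. The toy corpus, the reduced hyperparameters (`vector_size=32`, `min_count=1`), and the variable names are illustrative assumptions, not part of the original listing.

```python
# Minimal usage sketch; the toy corpus and hyperparameters are illustrative.
corpus = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "chased", "the", "cat"],
]

emb = GensimWord2Vec(corpus)
# min_count=1 so the tiny corpus is not pruned away entirely.
model = emb.train(vector_size=32, window=5, min_count=1)

doc_vectors = emb.recover(model)
print(len(doc_vectors))     # 2: one list of vectors per document
print(len(doc_vectors[0]))  # 6: one vector per in-vocabulary token
```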