import itertools
import multiprocessing as mp
import re
from abc import ABC, abstractmethod
from typing import Any, List

import nltk
import numpy as np


class Embeddings(ABC):
    """Abstract base class for embedding back-ends built from tokenised text.

    Concrete subclasses must implement :meth:`model` and :meth:`recover`.
    """

    def __init__(self, tokens: List[List[str]]):
        """Keep a reference to the corpus.

        Args:
            tokens: The corpus, annotated as a list of token lists. It is
                stored on ``self.tokens`` as given; no copy or validation
                is performed.
        """
        self.tokens = tokens

    @abstractmethod
    def model(self, **kwargs: Any) -> Any:
        """Build and return the embedding model for ``self.tokens``.

        Subclasses are free to expose their own named hyper-parameters;
        ``**kwargs`` here only signals that keyword options are allowed.
        """

    @abstractmethod
    def recover(self, model: Any = None) -> Any:
        """Return the embeddings (or an equivalent object) for ``model``.

        ``model`` defaults to ``None`` so that existing zero-argument
        calls to this base implementation keep working.
        """


class GensimWord2Vec(Embeddings):
    """Word2Vec embeddings backed by ``gensim.models.word2vec``."""

    def __init__(self, tokens: List[List[str]], **kwargs: Any):
        """Initialise with the corpus.

        Args:
            tokens: Forwarded unchanged to :class:`Embeddings`.
            **kwargs: Accepted for call-site compatibility and currently
                ignored.
        """
        super().__init__(tokens)

    def model(
        self,
        vector_size: int = 128,
        window: int = 20,
        min_count: int = 10,
        workers: int = 4,
        **kwargs: Any,
    ) -> Any:
        """Construct a gensim ``Word2Vec`` model from ``self.tokens``.

        The corpus is passed to the constructor, which per the gensim
        documentation trains the model immediately.

        Args:
            vector_size: Forwarded as ``vector_size`` (dimensionality of
                the word vectors in gensim's API).
            window: Forwarded as ``window`` (context window size).
            min_count: Forwarded as ``min_count`` (frequency cut-off for
                vocabulary words).
            workers: Forwarded as ``workers`` (number of worker threads).
            **kwargs: Any further keyword arguments, forwarded verbatim to
                ``word2vec.Word2Vec``.

        Returns:
            The ``word2vec.Word2Vec`` instance.
        """
        # Imported lazily so that gensim is only required when a model is
        # actually built, not when this module is imported.
        from gensim.models import word2vec

        return word2vec.Word2Vec(
            self.tokens,
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            workers=workers,
            **kwargs,
        )

    def recover(self, model: Any) -> Any:
        """Return ``model`` unchanged.

        NOTE: a per-token vector lookup (``model.wv[token]`` for every
        token present in ``model.wv``) was sketched here previously but
        never enabled, so this method is currently the identity.
        """
        return model