123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960 |
- import nltk
- import re
- import numpy as np
- import multiprocessing as mp
- import itertools
- from typing import List
- from abc import ABC, abstractmethod
class Embeddings(ABC):
    """Abstract base class for embedding back-ends built on a token corpus.

    The constructor stores the corpus on ``self.tokens``. Concrete
    subclasses have to provide both :meth:`model` and :meth:`recover`;
    the class cannot be instantiated until they do.
    """

    def __init__(self, tokens: List[List[str]]):
        """Keep a reference to ``tokens`` (a list of token lists)."""
        self.tokens = tokens

    @abstractmethod
    def model(self):
        """Hook for producing the embedding model.

        Abstract: subclasses must override. The base implementation does
        nothing and returns ``None``.
        """

    @abstractmethod
    def recover(self):
        """Hook for the recovery step.

        Abstract: subclasses must override. The base implementation does
        nothing and returns ``None``.
        """
class GensimWord2Vec(Embeddings):
    """``Embeddings`` implementation backed by gensim's ``Word2Vec``."""

    def __init__(self, tokens, **kwargs):
        """Store ``tokens`` via the base-class constructor.

        Extra keyword arguments are accepted but not used.
        """
        super().__init__(tokens)

    def model(
        self,
        vector_size: int = 128,
        window: int = 20,
        min_count: int = 10,
        workers: int = 4,
        **kwargs
    ):
        """Construct and return a gensim ``Word2Vec`` object from ``self.tokens``.

        Args:
            vector_size: Forwarded to ``Word2Vec`` as ``vector_size``.
            window: Forwarded to ``Word2Vec`` as ``window``.
            min_count: Forwarded to ``Word2Vec`` as ``min_count``.
            workers: Forwarded to ``Word2Vec`` as ``workers``.
            **kwargs: Any further keyword arguments, passed through to
                ``Word2Vec`` unchanged.

        Returns:
            The ``Word2Vec`` instance created from the stored tokens.
        """
        # Imported here rather than at module level, so gensim is only
        # needed once a model is actually requested.
        from gensim.models import word2vec

        named_options = {
            "vector_size": vector_size,
            "window": window,
            "min_count": min_count,
            "workers": workers,
        }
        return word2vec.Word2Vec(self.tokens, **named_options, **kwargs)

    def recover(self, model):
        """Return ``model`` unchanged.

        NOTE: this is currently a pass-through. An earlier, disabled draft
        looked up ``model.wv[token]`` for each token to build per-text
        embedding lists; that logic is not active.
        """
        return model
|