```python
import nltk
import re
import numpy as np
import multiprocessing as mp
import itertools
from typing import List
from abc import ABC, abstractmethod


class Embeddings(ABC):
    """Abstract base class that any embedding backend must implement."""

    def __init__(self, tokens: List[List[str]]):
        self.tokens = tokens

    @abstractmethod
    def create_model(self):
        pass

    @abstractmethod
    def load_model(self, path: str):
        pass


class GensimWord2Vec(Embeddings):
    """Word2Vec embeddings backed by gensim."""

    def __init__(self, tokens: List[List[str]] = None, **kwargs):
        # Avoid a mutable default argument; fall back to an empty corpus.
        super().__init__(tokens if tokens is not None else [[]])

    def create_model(
        self,
        vector_size: int = 100,
        window: int = 5,
        min_count: int = 10,
        workers: int = 4,
        **kwargs,
    ):
        # Train a Word2Vec model on the tokenized sentences.
        from gensim.models import word2vec

        self.model = word2vec.Word2Vec(
            self.tokens,
            vector_size=vector_size,
            window=window,
            min_count=min_count,
            workers=workers,
            **kwargs,
        )
        return self.model

    def load_model(self, path: str):
        # Load a previously trained Word2Vec model from disk.
        from gensim.models import word2vec

        self.model = word2vec.Word2Vec.load(path)
        return self.model
```
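
Since the wrapper above is only a class definition, here is a minimal usage sketch, assuming gensim 4.x and a pre-tokenized corpus; the toy sentences and the `word2vec.model` path are illustrative, not part of the original code.

```python
# Minimal usage sketch: the toy corpus and the save path are illustrative only.
sentences = [
    ["the", "quick", "brown", "fox"],
    ["the", "lazy", "dog"],
]

embeddings = GensimWord2Vec(tokens=sentences)
# min_count=1 so the tiny toy vocabulary is not filtered out.
model = embeddings.create_model(vector_size=100, window=5, min_count=1, workers=4)

# Persist the trained model, then reload it later through the same wrapper.
model.save("word2vec.model")
reloaded = GensimWord2Vec().load_model("word2vec.model")
```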