# embeddings.py
  1. import nltk
  2. import re
  3. import numpy as np
  4. import multiprocessing as mp
  5. import itertools
  6. from typing import List
  7. from abc import ABC, abstractmethod
  8. class Embeddings(ABC):
  9. def __init__(self, tokens: List[List[str]]):
  10. self.tokens = tokens
  11. @abstractmethod
  12. def create_model(self):
  13. pass
  14. @abstractmethod
  15. def load_model(self):
  16. pass
  17. class GensimWord2Vec(Embeddings):
  18. def __init__(self, tokens=[[]], **kwargs):
  19. super().__init__(tokens)
  20. def create_model(
  21. self,
  22. vector_size: int = 100,
  23. window: int = 5,
  24. min_count: int = 10,
  25. workers: int = 4,
  26. **kwargs
  27. ):
  28. from gensim.models import word2vec
  29. self.model = word2vec.Word2Vec(
  30. self.tokens,
  31. vector_size=vector_size,
  32. window=window,
  33. min_count=min_count,
  34. workers=workers,
  35. **kwargs
  36. )
  37. return self.model
  38. def load_model(self, path):
  39. from gensim.models import word2vec
  40. self.model = word2vec.Word2Vec.load(path)
  41. return self.model