train_language_models.sh

#!/bin/bash
programname=$0

function usage {
    echo "usage: $programname [-h] [-t trainfolder] [-o outfolder] [-k kenlm_path] [-n ngram_size]"
    echo "  -h  display help"
    echo "  -n  size of the n-grams for the language model"
    echo "  -t  folder that contains the training files"
    echo "  -o  output folder where the estimated parameters will be saved"
    echo "  -k  path to the kenlm folder"
    exit 1
}

# Parse command-line options (-h prints the usage message and exits).
while getopts "ht:o:k:n:" flag
do
    case "${flag}" in
        h) usage;;
        n) ngram_size=${OPTARG};;
        t) train_files=${OPTARG};;
        o) out_dirname=${OPTARG};;
        k) kenlm_folder=${OPTARG};;
        *) usage;;
    esac
done

echo "======================================================"
echo "ngram size:  $ngram_size"
echo "trainfolder: $train_files"
echo "outfolder:   $out_dirname"
echo "kenlm:       $kenlm_folder"

mkdir -p "$out_dirname"

echo "================= STARTING ESTIMATION ================"
# Estimate one ARPA language model per training file with KenLM's lmplz.
for filename in "$train_files"/*.one_sentence_per_line; do
    "$kenlm_folder"/build/bin/lmplz --discount_fallback -o "$ngram_size" < "$filename" > "$out_dirname/${filename##*/}.arpa"
done
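
A hypothetical invocation might look like the following sketch; the training-data, output, and KenLM paths are placeholders and need to be adapted to the local setup.

# Example (hypothetical paths): build 3-gram models from every
# *.one_sentence_per_line file in ./data/train and write the
# resulting ARPA files to ./models, using a KenLM checkout at ~/kenlm.
bash train_language_models.sh -n 3 -t ./data/train -o ./models -k ~/kenlm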