#!/bin/bash
#
# Estimate one n-gram language model per training file with KenLM's lmplz.
# Each *.one_sentence_per_line file found in the train folder is fed to
# <kenlm folder>/build/bin/lmplz and the result is written to the out folder
# as <file name>.arpa.

# Name the script was invoked as; interpolated into the usage message.
programname=$0
#######################################
# Print the command-line help and terminate the script.
# Globals:   programname (read; falls back to $0 when unset or empty)
# Arguments: none
# Outputs:   the help text on stdout
# Returns:   never returns; exits the shell with status 1
#######################################
usage() {
  printf '%s\n' \
    "usage: ${programname:-$0} [-h] [-t trainfolder] [-o outfolder] [-k kenlm_path] [-n ngram_size]" \
    " -h display help" \
    " -n size of the ngrams for the language model" \
    " -t folder that contains the train files" \
    " -o out folder where the estimated parameters will be saved" \
    " -k path to kenlm folder"
  exit 1
}
#######################################
# Parse the command-line options into the globals used by the rest of the
# script.
# Globals:   ngram_size, train_files, out_dirname, kenlm_folder (written)
#            OPTIND (reset, then left pointing past the last parsed option)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   getopts diagnostics on stderr for unknown options or options
#            that are missing their argument
# Returns:   0 when parsing succeeds; calls usage on -h, on an unknown option
#            and on a missing option argument
#######################################
parse_args() {
  local flag
  # getopts keeps its position in the global OPTIND; restart from the first
  # argument so the function can be called more than once.
  OPTIND=1
  while getopts 'ht:o:k:n:' flag; do
    case "${flag}" in
      h) usage ;;
      n) ngram_size=${OPTARG} ;;
      t) train_files=${OPTARG} ;;
      o) out_dirname=${OPTARG} ;;
      k) kenlm_folder=${OPTARG} ;;
      # getopts reports unknown options and missing arguments as '?'.
      *) usage ;;
    esac
  done
}

parse_args "$@"
# Drop exactly the options getopts consumed, leaving any operands in "$@".
shift $((OPTIND - 1))
# All four values are needed below. With an empty value the script would call
# mkdir without an operand or run lmplz from a path rooted at "/", so stop
# before touching anything.
if [[ -z "${ngram_size:-}" || -z "${train_files:-}" \
  || -z "${out_dirname:-}" || -z "${kenlm_folder:-}" ]]; then
  echo "error: the options -n, -t, -o and -k are all required" >&2
  exit 1
fi

# Echo the effective configuration before doing any work.
echo "======================================================"
echo "ngram size: $ngram_size"
echo "trainfolder: $train_files"
echo "outfolder: $out_dirname"
echo "kenlm: $kenlm_folder"

if ! mkdir -p -- "$out_dirname"; then
  echo "error: cannot create out folder: $out_dirname" >&2
  exit 1
fi

echo "================= STARTING ESTIMATION ================"
# Remembers whether any lmplz run failed; the remaining files are still
# processed so one bad input does not discard the others.
estimation_status=0
for filename in "$train_files"/*.one_sentence_per_line; do
  # When nothing matches, bash leaves the pattern itself in the list; skip it
  # instead of redirecting from a file that does not exist.
  [[ -e "$filename" ]] || continue
  # ${filename##*/} strips the directory part, so the model lands in the out
  # folder under the training file's own name plus ".arpa".
  if ! "$kenlm_folder/build/bin/lmplz" --discount_fallback -o "$ngram_size" \
    < "$filename" > "$out_dirname/${filename##*/}.arpa"; then
    echo "warning: lmplz failed for $filename" >&2
    estimation_status=1
  fi
done
exit "$estimation_status"