網頁

2019年1月23日 星期三

tsseract 訓練 在 Unbuntu 上

https://github.com/tesseract-ocr/tesseract/wiki/Compiling
https://github.com/tesseract-ocr/tesseract/wiki/Compiling-%E2%80%93-GitInstallation

sudo apt-get install g++
sudo apt-get install autoconf automake libtool
sudo apt-get install pkg-config
sudo apt-get install libpng-dev
sudo apt-get install libjpeg8-dev
sudo apt-get install libtiff5-dev
sudo apt-get install zlib1g-dev

sudo apt-get install libicu-dev
sudo apt-get install libpango1.0-dev
sudo apt-get install libcairo2-dev

sudo apt-get install libleptonica-dev

sudo apt install git

git clone https://github.com/tesseract-ocr/tesseract.git tesseract-ocr
git clone https://github.com/tesseract-ocr/langdata.git langdata
git clone https://github.com/tesseract-ocr/tessdata_best.git tessdata_best
git clone https://github.com/tesseract-ocr/tessdata_fast.git tessdata_fast
cd tesseract-ocr
./autogen.sh
./configure
make
sudo make install
sudo ldconfig
make training
sudo make training-install

https://github.com/tesseract-ocr/tesseract/wiki/Data-Files

sudo apt install curl
sudo apt install default-jre
sudo apt install openjdk-11-jre-headless
sudo apt install openjdk-8-jre-headless
sudo apt install default-jdk
sudo apt install openjdk-11-jdk-headless
sudo apt install openjdk-8-jdk-headless

cd java
make ScrollView.jar

https://github.com/tesseract-ocr/tesseract/wiki/ViewerDebugging

text2image --find_fonts \
--fonts_dir /usr/share/fonts \
--text ./langdata/plate/plate.
--min_coverage .9  \
--outputbase ./langdata/plate/plate \
|& grep raw \
 | sed -e 's/ :.*/@ \\/g' \
 | sed -e "s/^/  '/" \
 | sed -e "s/@/'/g" >./langdata/plate/fontslist.txt

text2image --font="aakar Medium" \
--fonts_dir /usr/share/fonts \
--text ./langdata/plate/plate.txt \
--min_coverage .9  \
--outputbase ./langdata/plate/plate.aakar_Medium.exp0

text2image --font="Abyssinica SIL" \
--fonts_dir /usr/share/fonts \
--text ./langdata/plate/plate.txt \
--min_coverage .9  \
--outputbase ./langdata/plate/plate.Abyssinica_SIL.exp0

mkdir training
cd training
mkdir -p langdata/eng
準備 langdata/eng/eng.traning_text
拷貝 langdata/radical-stroke.txt
mkdir -p tessdata/configs
拷貝 tessdata/eng.traineddata
拷貝 tessdata/configs/lstm.train

~/ocr/tesseract-ocr/src/training/tesstrain.sh --fonts_dir /usr/share/fonts \
 --lang eng --linedata_only \
 --noextract_font_properties --langdata_dir langdata \
 --tessdata_dir tessdata \
 --output_dir train

若字型不存在 (Could not find font named 'xxx xxx')
編輯 src/training/language-specific.sh 刪除不存在的字型

tif 檔產生到一半停住了, 原因是利用背景處理加速
let rem=counter%par_factor 為零時出錯
修改 src/training/tesstrain_utils.sh, 不要使用背景處理
有以下兩段, 刪除命令後面的 &
phase_I_generate_image()
  generate_font_image "${font}"
phase_E_extract_features()
  run_command tesseract ${img_file} ${img_file%.*} ${box_config} ${config} &
並刪除相關的 let, if, wait

combine_tessdata -e ../tessdata_best/eng.traineddata train/eng.lstm

lstmtraining --model_output model \
 --continue_from train/eng.lstm \
 --traineddata train/eng/eng.traineddata \
 --old_traineddata ../tessdata_best/eng.traineddata \
 --train_listfile train/eng.training_files.txt \
 --max_iterations 3600

lstmtraining --stop_training \
 --continue_from model_checkpoint \
 --traineddata train/eng/eng.traineddata \
 --old_traineddata ../tessdata_best/eng.traineddata \
 --model_output new.traineddata

若要直接使用 tif 和 box
修改 src/training/tesstrain_utils.sh, 固定 tif, box 檔案的目錄
    #TRAINING_DIR=${TMP_DIR}
    TRAINING_DIR=/tmp/images
修改 src/training/tesstrain.sh, 不要自動產生圖形
#phase_I_generate_image 8

沒有留言:

張貼留言