https://github.com/tesseract-ocr/tesseract/wiki/Compiling-%E2%80%93-GitInstallation
sudo apt-get install g++
sudo apt-get install autoconf automake libtool
sudo apt-get install pkg-config
sudo apt-get install libpng-dev
sudo apt-get install libjpeg8-dev
sudo apt-get install libtiff5-dev
sudo apt-get install zlib1g-dev
sudo apt-get install libicu-dev
sudo apt-get install libpango1.0-dev
sudo apt-get install libcairo2-dev
sudo apt-get install libleptonica-dev
sudo apt install git
git clone https://github.com/tesseract-ocr/tesseract.git tesseract-ocr
git clone https://github.com/tesseract-ocr/langdata.git langdata
git clone https://github.com/tesseract-ocr/tessdata_best.git tessdata_best
git clone https://github.com/tesseract-ocr/tessdata_fast.git tessdata_fast
cd tesseract-ocr
./autogen.sh
./configure
make
sudo make install
sudo ldconfig
make training
sudo make training-install
https://github.com/tesseract-ocr/tesseract/wiki/Data-Files
sudo apt install curl
sudo apt install default-jre
sudo apt install openjdk-11-jre-headless
sudo apt install openjdk-8-jre-headless
sudo apt install default-jdk
sudo apt install openjdk-11-jdk-headless
sudo apt install openjdk-8-jdk-headless
cd java
make ScrollView.jar
https://github.com/tesseract-ocr/tesseract/wiki/ViewerDebugging
text2image --find_fonts \
--fonts_dir /usr/share/fonts \
--text ./langdata/plate/plate.txt \
--min_coverage .9 \
--outputbase ./langdata/plate/plate \
|& grep raw \
| sed -e 's/ :.*/@ \\/g' \
| sed -e "s/^/ '/" \
| sed -e "s/@/'/g" >./langdata/plate/fontslist.txt
text2image --font="aakar Medium" \
--fonts_dir /usr/share/fonts \
--text ./langdata/plate/plate.txt \
--min_coverage .9 \
--outputbase ./langdata/plate/plate.aakar_Medium.exp0
text2image --font="Abyssinica SIL" \
--fonts_dir /usr/share/fonts \
--text ./langdata/plate/plate.txt \
--min_coverage .9 \
--outputbase ./langdata/plate/plate.Abyssinica_SIL.exp0
mkdir training
cd training
mkdir -p langdata/eng
準備 langdata/eng/eng.training_text
拷貝 langdata/radical-stroke.txt
mkdir -p tessdata/configs
拷貝 tessdata/eng.traineddata
拷貝 tessdata/configs/lstm.train
~/ocr/tesseract-ocr/src/training/tesstrain.sh --fonts_dir /usr/share/fonts \
--lang eng --linedata_only \
--noextract_font_properties --langdata_dir langdata \
--tessdata_dir tessdata \
--output_dir train
若字型不存在 (Could not find font named 'xxx xxx')
編輯 src/training/language-specific.sh 刪除不存在的字型
tif 檔產生到一半停住了, 原因是利用背景處理加速
當 let rem=counter%par_factor 的運算結果為零時出錯 (bash 的 let 在結果為 0 時會傳回非零的結束狀態)
修改 src/training/tesstrain_utils.sh, 不要使用背景處理
有以下兩段, 刪除命令後面的 &
phase_I_generate_image()
generate_font_image "${font}"
phase_E_extract_features()
run_command tesseract ${img_file} ${img_file%.*} ${box_config} ${config} &
並刪除相關的 let, if, wait
combine_tessdata -e ../tessdata_best/eng.traineddata train/eng.lstm
lstmtraining --model_output model \
--continue_from train/eng.lstm \
--traineddata train/eng/eng.traineddata \
--old_traineddata ../tessdata_best/eng.traineddata \
--train_listfile train/eng.training_files.txt \
--max_iterations 3600
lstmtraining --stop_training \
--continue_from model_checkpoint \
--traineddata train/eng/eng.traineddata \
--old_traineddata ../tessdata_best/eng.traineddata \
--model_output new.traineddata
若要直接使用 tif 和 box
修改 src/training/tesstrain_utils.sh, 固定 tif, box 檔案的目錄
#TRAINING_DIR=${TMP_DIR}
TRAINING_DIR=/tmp/images
修改 src/training/tesstrain.sh, 不要自動產生圖形
#phase_I_generate_image 8
沒有留言:
張貼留言