Training tesseract 4, preparing glyphs
Post ReplyTraining tesseract 4, preparing glyphsPosted: Friday, November 27, 2020 [22:03:44] - 1
There are many ways to create a glyph and box files to train tesseract. One of the main obstacles is getting glyphs coordinates right for the box file. There are third party programs for that but it is very painful to edit file even then. Simpler way is to generate tiff file with all the glyphs while properly recording coordinates. First we need to collect text from either scanned documents or images. In our case client only provided receipts pictures taken by a smartphone. Let's collect as mach data as possible from collection of receipts. For that we need to iterate through the all receipt images and convert them to a similar size Then get every character as a separate image using opencv. #!/usr/bin/perl use strict; $ENV{'PATH'} = '/opt/local/lib:/usr/local/include:/opt/local:/opencv:/opt/local/sbin:/opt/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin'; my $image_dir='/your_images_dir_location'; ## FIRST WE NEED TO REMOVE BACKGROUND my @fls = glob("$image_dir/*.jpg"); ## RAW RECEIPTS LOCATION foreach my $f (@fls) {my $nnme=$f; if(-f '$image_dir/receipt.tif') {unlink '$image_dir/receipt.tif';} `python $prog_dir/read.py -i $f`; if(-f '$image_dir/receipt.tif') { `convert -density 300 -type grayscale $image_dir/receipt.tif -level 27%,73%,.6 -blur 1x2 -sharpen 0x3 +repage -blur 1x2 -blur 1x2 $image_dir/$nnme`; } ## IMAGE BUILT } ## FOREACH IMAGE FILE END ## NOW WE CREATE SEPARATE IMAGE FOR EVERY CHARACTER my @rline=(); my @fls = glob("$image_dir/*.tif"); `rm -rf $prog_dir/temp/*`; open(TXT,">$prog_dir/sizes.txt"); ## LOG ALL IMAGES SIZES foreach my $f (@fls) { `rm -rf $prog_dir/temp1/*`; if($f =~ m/\.tif\.tif/) {next;} my $imnm=$f; $imnm =~ s/\.done//; print "$f\t$imnm\n"; my $res2 = `python $prog_dir/find.duo2.py -i $f`; @rline = split(/\n/,$res2); foreach my $l (@rline) {$l =~ tr/ //s; my($imgt,$x,$y,$w,$h) = split(/ /,$l); if($w > 300 && $h > 100) {$skew{$imgt}=1;} ## PROCESS ONLY BIG PARTS } ## FOREACH END my $tangls=0; foreach my $fnm (sort num keys(%skew)) {$imgtest++; my $rfile='$prog_dir/temp1/'.$fnm.'.tif'; my $rota = `python $prog_dir/rotate.duo.n.py -i $rfile`; unless($rota) {$imgtest--; if($rota < 0) {$rota =~ s/\-//;} else {$rota='-'.$rota;} $tangls += $rota; } ## FOREACH IMAGE SKEW my $ang=0; unless($tangls == 0) {my $ang = $tangls / $totimg; unless($excl{$imnm}) { `convert -density 300 -type grayscale $f$frotate -blur 1x2 -sharpen 0x3 +repage $f.tif`; } else { `convert -density 300 -type grayscale $f$frotate -blur 1x2 -sharpen 0x3 +repage $f.tif`; } if(-f '$prog_dir/receipt.tif') {unlink '$prog_dir/receipt.tif';} my $res = `python $prog_dir/train.py -i $f.tif`; $res =~ s#No count defined##; my @rline = split(/\n/,$res); foreach my $l (@rline) {$l =~ tr/ //s; `mv $prog_dir/temp/$imgt.tif $prog_dir/temp/$imnm.$imgt.tif`; print TXT "$imnm.$imgt.tif\t$x\t$y\t$w\t$h\n"; } ## FOREACH LINE END } ## FOREACH IMAGE FILE END close(TXT); sub num {$a <=> $b;} Next, HTML form to add characters to the code. |
Training tesseract 4 - indexing glyphsPosted: Friday, November 27, 2020 [22:51:23] - 2
Now we need to index individual character tif files. #!/usr/bin/perl use strict; our %in=(); my $web_dir='your_web_dir'; ## APACHE WEB DIRECTORY eval{require'cgi-lib.pl'}; if(-f "$prog_dir/sizes.txt") { my $d = `cat $prog_dir/sizes.txt`; foreach my $l (@all) { my($im,$x,$y,$w,$h) = split(/\t/,$l); unless(-f "$prog_dir/fonts/$im") {next;} $new{$im}="\t$im\t$w\t$h"; } ## FOREACH END } ## END IF FILE PRESENT my $done=0; for my $nmb (1..2000) { ## FOR 2000 POSSIBLE IMAGES LOOP unless($in{"let$nmb"} eq '0') { unless($in{"let$nmb"}) {next;} } my $let=$in{"let$nmb"}; my $im=$in{"im$nmb"}; my $w=$in{"w$nmb"}; my $h=$in{"h$nmb"}; unless($im && $w && $h) {next;} $done++; } ## FOR END if($done > 0) { ## NEW RECORDS open(TXT,">$prog_dir/box.prepare.txt"); foreach my $l (sort keys(%new)) { unless($new{$l}) {next;} print TXT "$new{$l}\n"; } close(TXT); } ## END NEW RECORDS my $noim=0; my $pg = "<p>Processed $done records</p>\n<form method=\"post\" action=\"/box.cgi\">\n<table border=0 cellpadding=4 cellspacing=1>"; if(-f "$prog_dir/box.prepare.txt") { my $d = `cat $prog_dir/box.prepare.txt`; my @all = split(/\n/,$d); foreach my $l (@all) { #10.1.tif my($let,$im,$w,$h) = split(/\t/,$l); unless(-f "$prog_dir/fonts/$im") {next;} #my $sh=''; $new{$im}="$let\t$im\t$w\t$h"; } ## FOREACH END } ## END IF RESULT TEXT FILE PRESENT my @alphbt = qw(q w e r t y u i o p a s d f g h j k l z x c v b n m Q W E R T Y U I O P A S D F G H J K L Z X C V B N M 1 2 3 4 5 6 7 8 9 0); my %missingalph=(); foreach my $ltr (@alphbt) {$missingalph{$ltr}=1;} foreach my $l (sort keys(%new)) { my($let,$im,$w,$h) = split(/\t/,$new{$l}); my $imh=$im; delete $missingalph{$let}; if($h > 70) {$rh = 30;} else {$rh=20;} if($h > 70) {$rh = 30;} else {$rh=20;} unless(-f "$web_dir/fonts/$imh") { `/opt/local/bin/convert $prog_dir/fonts/$im -resize x$rh -colors 2 -colorspace gray -normalize +repage -quality 50 -flatten -colorspace gray +repage $web_dir/fonts/$imh`; } $noim++; if($noim < 2) {$nbsp=' '} $nospr++; if($nospr == 25) {$nospr=0; my $hidden = "<input type=\"hidden\" name=\"w$noim\" value=\"$w\"><input type=\"hidden\" name=\"h$noim\" value=\"$h\"><input type=\"hidden\" name=\"im$noim\" value=\"$im\">"; $pg.= "<tr><td>$noim\.</td><td><input type=text name=\"let$noim\" value=\"$let\"size=\"3\"></td><td>$nbsp</td><td><img src=\"/fonts/$imh\" border=0 height=\"30\"></td><td>$im</td></tr>$hidden$sprsbm\n"; } ## FOREACH LINE END my $pg1=''; foreach my $miss (sort keys(%missingalph)) {$pg1 .= "$miss\n";} if($pg1) {chomp $pg1; } $pg .= "<tr class=\"gr\"><td colspan=6 align=right><input type=\"submit\" value=\"Submit\"></td></tr>\n</table>\n</form>$pg1\n</body>\n</html>\n"; printres('Processed',$pg); ## PRINT PAGE sub printres { my($title,$page) = @_; print "Content-type: text/html\n\n"; print <<EOH; <!doctype html>\n<html>\n<head>\n<meta charset="utf-8">$js <title>$title</title>\n<meta name="viewport" content="width=device-width, initial-scale=1">\n<link rel="stylesheet" href="all.css" media="screen"> <link href="fonts.exo.css" rel="stylesheet"><link href="fonts.exo2.css" rel="stylesheet"> <link rel="preconnect" href="fonts.gstatic.com"> <link href="https://fonts.googleapis.com/css2?family=Jura&display=swap" rel="stylesheet"> <style>.gr{background-color:#f0eff1!important;}.gr td{background-color:#f0eff1!important;border: 1px solid #8b8a8c;}input[type="text"]{font-size:25px;height:30px;font-family: 'Jura', sans-serif;}td{padding:3px 8px;}input[type=submit]{margin:0!important;}</style> </head> <body> <div id="content"> <h1>$title</h1> $page </div> </body> </html> EOH exit(0); } ## END SUB PRINT RESULT result page: using resulting page appropriate character added to the corresponding tif file. |
RE: Training tesseract 4 - creating box and tiff filesPosted: Friday, November 27, 2020 [23:06:44] - 3
In Terminal run the following: #!/usr/bin/perl use strict; ## Creating tiff file with glyps and corresponding box file my $lang='eng2'; ## Font language my $fontname='shipreceipts'; ## Font name my $expfnt='exp0'; my $filenames="$lang\.$fontname\.$expfnt"; ## Properly formatted font file name my %create=(); my $prog_dir='/your_program_dir_location'; my $d = `cat $prog_dir/box.prepare.txt`; foreach my $ln (@all) { my($l,$f,$w,$h) = split(/\t/,$ln); unless($l eq '0') { unless($l) {next;} } $create{$f}=$ln; if(($ww + $w) > 2500) {$wh += ($maxlh + 20); if($maxlh < $h) {$maxlh=$h;} $ww += $w; unless(-f "$prog_dir/temp1/$f") { `convert -density 300 $prog_dir/fonts/$f -white-threshold 90% -colors 2 -colorspace gray -normalize +repage $prog_dir/temp1/$f`; } } ## FOREACH END $wh += $maxlh * 2; ## Create white background file `convert -size 2550x$wh xc:white -fill white $prog_dir/temp1/back1.jpg`; `convert -density 300 $prog_dir/temp1/back1.jpg $prog_dir/temp1/back.tif`; `rm $prog_dir/temp1/back1.jpg`; my $imheight=$wh; $ww=20; open(TXB,">$prog_dir/$filenames.box"); ## Box file foreach my $f (keys (%create)) { my($l,$f,$w,$h) = split(/\t/,$create{$f}); if(($ww + $w) > 2500) { $wh += ($maxlh + 20); print TXB $saved; ## Print newline according to tesseract 4 specs } if($maxlh < $h) {$maxlh=$h;} ## Add character to the file using ImageMagick `composite -geometry +$ww+$wh $prog_dir/temp1/$f $prog_dir/temp1/back.tif $prog_dir/temp1/back.tif`; my $bot=$wh+$h; my $right=$ww+$w; my $toppos = $imheight - $wh; my $lftps=$ww-1; print TXB "$l $lftps $bot $right $toppos 0\n"; print "$l $lftps $bot $right $toppos 0\t$l,$f,$w,$h\n"; $ww += $w; $cntb++; } ## FOREACH FILE END close(TXB); `cp $prog_dir/temp1/back.tif $prog_dir/$filenames.tif`; program will create tiff file: and box file similar to this: I 19 758 57 823 0 3 65 760 99 823 0 e 107 760 150 823 0 A 158 759 192 823 0 7 200 760 234 823 0 p 242 753 278 823 0 a 286 759 331 823 0 B 339 759 373 823 0 A 381 759 414 823 0 T 422 758 455 823 0 2 463 758 497 823 0 ... 2 1221 60 1257 124 0 0 1265 57 1300 124 0 0 1308 60 1343 124 0 a 1351 60 1395 124 0 g 1403 59 1438 124 0 P 1446 59 1481 124 0 T 1489 61 1522 124 0 e 1530 59 1573 124 0 b 1581 42 1625 124 0 all that needs to be done is to create a eng2.traineddata file for tesseract |
RE: Training tesseract 4 - creating traineddata filePosted: Friday, November 27, 2020 [23:20:09] - 4
To create a traineddata file move both tiff and box files to a separate folder. CD into this folder and run the following commands (one command per line): cat eng2.shipreceipts.exp0.box.tr > eng2.shipreceipts.exp.tr unicharset_extractor eng2.shipreceipts.exp0.box shapeclustering -F eng2.font_properties -U unicharset eng2.shipreceipts.exp.tr mftraining -F eng2.font_properties -U unicharset -O eng2.unicharset eng2.shipreceipts.exp.tr cntraining eng2.shipreceipts.exp.tr mv inttemp eng2.inttemp mv normproto eng2.normproto mv pffmtable eng2.pffmtable mv shapetable eng2.shapetable wordlist2dawg frequent-words-list.txt eng2.freq-dawg eng2.unicharset combine_tessdata eng2. NOTE the dot (.) on the end tesseract training has to be installed for this to work. Now, move the "eng2.traineddata" file to the "tessdata" directory (folder) on your system. I.e. to use newly created traineddata file one of the ways to use it would be: TXT extension added by tesseract to the "text.result" and result file ends-up as "text.result.txt" |
RE: Training tesseract 4 - python programsPosted: Friday, November 27, 2020 [23:52:14] - 5
Python programs used in programs above: read.py from transform import four_point_transform import cv2 import numpy as np import argparse import imutils ap = argparse.ArgumentParser() ap.add_argument("-i", "--image", required=True, help="path to input image file") args = vars(ap.parse_args()) image = cv2.imread(args["image"]) ratio = image.shape[0] / 500.0 orig = image.copy() image = imutils.resize(image, height = 500) gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) gray = cv2.GaussianBlur(gray, (5, 5), 0) edged = cv2.Canny(gray, 5, 80) ## good for contrast images cnts = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE) cnts = imutils.grab_contours(cnts) cnts = sorted(cnts, key = cv2.contourArea, reverse = True)[:5] for c in cnts: peri = cv2.arcLength(c, True) approx = cv2.approxPolyDP(c, 0.02 * peri, True) if len(approx) == 4: screenCnt = approx break try: screenCnt except NameError: print("No count defined") else: print("STEP 2: Find contours of paper") cv2.drawContours(image, [screenCnt], -1, (0, 255, 0), 2) warped = four_point_transform(orig, screenCnt.reshape(4, 2) * ratio) warped = cv2.cvtColor(warped, cv2.COLOR_BGR2GRAY) #print("STEP 3: Apply perspective transform") imgname = '/http/store/incoming/receipt.tif' cv2.imwrite(imgname, warped) find.duo2.py import cv2 import numpy as np import argparse ap = argparse.ArgumentParser() ap.add_argument("-i", "--image", required=True, help="path to input image file") args = vars(ap.parse_args()) image = cv2.imread(args["image"]) gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) gray = cv2.GaussianBlur(gray, (3, 3), 0) edged = cv2.Canny(gray, 10, 250) kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 25)) closed = cv2.morphologyEx(edged, cv2.MORPH_CLOSE, kernel) (im2,cnts, _) = cv2.findContours(closed.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) noim = 0 for c in cnts: peri = cv2.arcLength(c, True) approx = cv2.approxPolyDP(c, 0.02 * peri, True) if len(approx) >= 3: x, y, w, h = cv2.boundingRect(c) if w > 5 and h > 1: noim += 1 roi = image[y:y+h, x:x+w] imgname = '/http/store/temp1/'+str(noim)+'.tif' cv2.imwrite(imgname, roi) print str(noim)," ",x," ",y," ",w," ",h rotate.duo.n.py import cv2 import numpy as np import argparse ap = argparse.ArgumentParser() ap.add_argument("-i", "--image", required=True, help="path to input image file") args = vars(ap.parse_args()) image = cv2.imread(args["image"]) gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) gray = cv2.bitwise_not(gray) thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1] coords = np.column_stack(np.where(thresh > 0)) angle = cv2.minAreaRect(coords)[-1] if angle < -45: angle = -(90 + angle) else: angle = -angle if angle and abs(angle) <= 1: print("{:.3f}".format(angle)) else: print "0" train.py import cv2 import numpy as np import argparse ap = argparse.ArgumentParser() ap.add_argument("-i", "--image", required=True, help="path to input image file") args = vars(ap.parse_args()) image = cv2.imread(args["image"]) gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) gray = cv2.GaussianBlur(gray, (3, 3), 0) edged = cv2.Canny(gray, 10, 250) ret,thresh = cv2.threshold(edged, 127,255,0) kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5)) #kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (100, 100)) #kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25, 25)) closed = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel) (im2,cnts, _) = cv2.findContours(closed.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) noim = 0 for c in cnts: peri = cv2.arcLength(c, True) approx = cv2.approxPolyDP(c, 0.02 * peri, True) if len(approx) >= 3: x, y, w, h = cv2.boundingRect(c) if w > 30 and h > 60: noim += 1 roi = image[y:y+h, x:x+w] imgname = '/http/store/temp/'+str(noim)+'.tif' cv2.imwrite(imgname, roi) print str(noim)," ",x," ",y," ",w," ",h Happy coding! |