aboutsummaryrefslogtreecommitdiffstats
path: root/contrib
diff options
context:
space:
mode:
authorMario Domenech Goulart <mario@ossystems.com.br>2014-05-26 09:59:00 -0300
committerMartin Jansa <Martin.Jansa@gmail.com>2014-06-10 12:16:23 +0200
commitcb41796a5e0573bf3676b5c54fcc12c6dd42f9fb (patch)
tree5c7665d5744fea0f2ee55efd620ab06736ee2956 /contrib
parentfccc8f1514c5bc84a343a757f60c9dc793db2643 (diff)
downloadmeta-openembedded-contrib-cb41796a5e0573bf3676b5c54fcc12c6dd42f9fb.tar.gz
contrib/tesseract-langs.sh: add script to generate recipes for tesseract languages
This script writes language recipes for tesseract. It downloads the listing of available languages and language tarballs from the official site and writes language recipes tesseract-lang-<lang>_<version>.bb for each language. Signed-off-by: Mario Domenech Goulart <mario@ossystems.com.br> Signed-off-by: Martin Jansa <Martin.Jansa@gmail.com>
Diffstat (limited to 'contrib')
-rwxr-xr-xcontrib/tesseract-langs.sh92
1 files changed, 92 insertions, 0 deletions
diff --git a/contrib/tesseract-langs.sh b/contrib/tesseract-langs.sh
new file mode 100755
index 0000000000..50873c139b
--- /dev/null
+++ b/contrib/tesseract-langs.sh
@@ -0,0 +1,92 @@
+#! /bin/sh
+
+# Copyright (C) 2014, O.S. Systems Software Ltda. All Rights Reserved
+# Released under the MIT license (see meta-openembedded layer's COPYING.MIT)
+
+PV='3.02'
+
+# Sometimes the software package has a minor version, but language
+# packages have not. Example:
+# software package: tesseract-ocr-3.02.02.tar.gz
+# language package: tesseract-ocr-3.02.por.tar.gz
+MINOR_PV=02
+
+recipes_dir=$1
+
+usage() {
+ echo "Usage: `basename $0` <recipes dir> [ <download dir> ]"
+}
+
+if [ -z "$recipes_dir" ]; then
+ usage
+ exit 1
+fi
+mkdir -p "$recipes_dir"
+
+file_list_uri='https://code.google.com/p/tesseract-ocr/downloads/list'
+file_list=`mktemp`
+
+remove_dl_dir=
+if [ -z "$2" ]; then
+ remove_dl_dir=1
+ dl_dir=`mktemp -d`
+else
+ dl_dir="$2"
+fi
+
+mkdir -p $dl_dir
+
+tesseract_langs() {
+ wget -q -O "$file_list" "$file_list_uri"
+
+ grep -E 'a href="detail\?name=tesseract-ocr-'${PV}'\.[^\.]+.tar.gz&amp;can=2&amp;q=">' "$file_list" | \
+ sed -r -e 's/.*tesseract-ocr-'${PV}'\.*([^\.]+)\.tar\.gz.*/\1/' | \
+ grep -Ev '('${MINOR_PV}'|'${MINOR_PV}'-doc-html)' | \
+ sort -u
+}
+
+download_lang_files() {
+ local langs="$1"
+ local uri
+ for lang in $langs; do
+ if [ ! -e "$dl_dir/tesseract-ocr-${PV}.${lang}.tar.gz" ]; then
+ uri="https://tesseract-ocr.googlecode.com/files/tesseract-ocr-${PV}.${lang}.tar.gz"
+ echo "Downloading $uri"
+ wget -q -P "$dl_dir" "$uri"
+ fi
+ done
+}
+
+create_recipe() {
+ local lang=$1
+ local tarball
+
+ tarball="$dl_dir/tesseract-ocr-${PV}.${lang}.tar.gz"
+
+ md5sum=`md5sum $tarball | awk '{print $1}'`
+ sha256sum=`sha256sum $tarball | awk '{print $1}'`
+
+ cat > $recipes_dir/tesseract-lang-`echo ${lang} | sed s/_/-/g`_${PV}.bb <<EOF
+# Copyright (C) 2014, O.S. Systems Software Ltda. All Rights Reserved
+# Released under the MIT license (see meta-openembedded layer's COPYING.MIT)
+
+TESSERACT_LANG = "$lang"
+
+require tesseract-lang.inc
+
+SRC_URI[md5sum] = "${md5sum}"
+SRC_URI[sha256sum] = "${sha256sum}"
+EOF
+}
+
+
+LANGS=`tesseract_langs`
+
+download_lang_files "$LANGS"
+
+for lang in $LANGS; do
+ create_recipe $lang
+done
+
+[ -n "$remove_dl_dir" ] && rm -rf $dl_dir
+rm -f $file_list