aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mario Domenech Goulart <mario@ossystems.com.br> 2014-05-26 09:59:00 -0300
committer Martin Jansa <Martin.Jansa@gmail.com> 2014-06-10 12:16:23 +0200
commit cb41796a5e0573bf3676b5c54fcc12c6dd42f9fb (patch)
tree 5c7665d5744fea0f2ee55efd620ab06736ee2956
parent fccc8f1514c5bc84a343a757f60c9dc793db2643 (diff)
download meta-openembedded-contrib-cb41796a5e0573bf3676b5c54fcc12c6dd42f9fb.tar.gz
contrib/tesseract-langs.sh: add script to generate recipes for tesseract languages
This script writes language recipes for tesseract. It downloads the listing of available languages and the language tarballs from the official site, then writes a language recipe tesseract-lang-<lang>_<version>.bb for each language.

Signed-off-by: Mario Domenech Goulart <mario@ossystems.com.br>
Signed-off-by: Martin Jansa <Martin.Jansa@gmail.com>
-rwxr-xr-x contrib/tesseract-langs.sh 92
1 files changed, 92 insertions, 0 deletions
diff --git a/contrib/tesseract-langs.sh b/contrib/tesseract-langs.sh
new file mode 100755
index 0000000000..50873c139b
--- /dev/null
+++ b/contrib/tesseract-langs.sh
@@ -0,0 +1,92 @@
+#! /bin/sh
+
+# Copyright (C) 2014, O.S. Systems Software Ltda. All Rights Reserved
+# Released under the MIT license (see meta-openembedded layer's COPYING.MIT)
+
+# Release series of tesseract; the language tarball names embed it.
+PV="3.02"
+
+# The software package may carry a minor version that the language
+# packages do not have. Example:
+#   software package: tesseract-ocr-3.02.02.tar.gz
+#   language package: tesseract-ocr-3.02.por.tar.gz
+MINOR_PV="02"
+
+# First argument: directory the generated recipes are written to.
+recipes_dir="${1}"
+
+# Print a one-line synopsis of the command-line arguments on stdout.
+# "$0" is quoted so a script path containing spaces is handled, and
+# printf is used so backslashes in the name are never interpreted.
+usage() {
+    printf 'Usage: %s <recipes dir> [ <download dir> ]\n' "$(basename "$0")"
+}
+
+# The recipes directory is mandatory; complain on stderr and bail out
+# before anything is created.
+if [ -z "$recipes_dir" ]; then
+    usage >&2
+    exit 1
+fi
+mkdir -p "$recipes_dir" || exit 1
+
+file_list_uri='https://code.google.com/p/tesseract-ocr/downloads/list'
+file_list=`mktemp` || exit 1
+
+remove_dl_dir=
+dl_dir=
+
+# Remove the temporary listing and, when this script created it, the
+# temporary download directory. Safe to run more than once.
+cleanup_tmp() {
+    rm -f "$file_list"
+    if [ -n "$remove_dl_dir" ] && [ -n "$dl_dir" ]; then
+        rm -rf "$dl_dir"
+    fi
+}
+# Clean up on exit; turn HUP/INT/TERM into a regular exit so the EXIT
+# trap also runs in shells that skip it when killed by a signal.
+trap cleanup_tmp EXIT
+trap 'exit 1' HUP INT TERM
+
+# Without a second argument the tarballs go to a throw-away temporary
+# directory, which is flagged for removal.
+if [ -z "$2" ]; then
+    remove_dl_dir=1
+    dl_dir=`mktemp -d` || exit 1
+else
+    dl_dir="$2"
+fi
+
+mkdir -p "$dl_dir" || exit 1
+
+tesseract_langs() {
+ wget -q -O "$file_list" "$file_list_uri"
+
+ grep -E 'a href="detail\?name=tesseract-ocr-'${PV}'\.[^\.]+.tar.gz&amp;can=2&amp;q=">' "$file_list" | \
+ sed -r -e 's/.*tesseract-ocr-'${PV}'\.*([^\.]+)\.tar\.gz.*/\1/' | \
+ grep -Ev '('${MINOR_PV}'|'${MINOR_PV}'-doc-html)' | \
+ sort -u
+}
+
+download_lang_files() {
+ local langs="$1"
+ local uri
+ for lang in $langs; do
+ if [ ! -e "$dl_dir/tesseract-ocr-${PV}.${lang}.tar.gz" ]; then
+ uri="https://tesseract-ocr.googlecode.com/files/tesseract-ocr-${PV}.${lang}.tar.gz"
+ echo "Downloading $uri"
+ wget -q -P "$dl_dir" "$uri"
+ fi
+ done
+}
+
+create_recipe() {
+ local lang=$1
+ local tarball
+
+ tarball="$dl_dir/tesseract-ocr-${PV}.${lang}.tar.gz"
+
+ md5sum=`md5sum $tarball | awk '{print $1}'`
+ sha256sum=`sha256sum $tarball | awk '{print $1}'`
+
+ cat > $recipes_dir/tesseract-lang-`echo ${lang} | sed s/_/-/g`_${PV}.bb <<EOF
+# Copyright (C) 2014, O.S. Systems Software Ltda. All Rights Reserved
+# Released under the MIT license (see meta-openembedded layer's COPYING.MIT)
+
+TESSERACT_LANG = "$lang"
+
+require tesseract-lang.inc
+
+SRC_URI[md5sum] = "${md5sum}"
+SRC_URI[sha256sum] = "${sha256sum}"
+EOF
+}
+
+
+LANGS=`tesseract_langs`
+
+download_lang_files "$LANGS"
+
+for lang in $LANGS; do
+ create_recipe $lang
+done
+
+[ -n "$remove_dl_dir" ] && rm -rf $dl_dir
+rm -f $file_list