Mercurial > dovecot > core-2.2
changeset 18643:2cfb80f7785e
lib-fts: Reverted e80969ea8684 which replaced .sh scripts with awk
Bugs in older awk versions (used at least by Debian squeeze & wheezy) caused
awk to crash while processing the script.
author | Timo Sirainen <tss@iki.fi> |
---|---|
date | Tue, 12 May 2015 12:20:56 +0300 |
parents | 7d52d6595f5e |
children | e991baeb8bb7 |
files | src/lib-fts/Makefile.am src/lib-fts/word-boundary-data.awk src/lib-fts/word-boundary-data.sh src/lib-fts/word-break-data.awk src/lib-fts/word-break-data.sh |
diffstat | 5 files changed, 182 insertions(+), 211 deletions(-) [+] |
line wrap: on
line diff
# Makefile.am: the generators for word-boundary-data.c and word-break-data.c
# go back from "$(AWK) -f <script>.awk" to running the .sh scripts directly,
# and the file list above the rules is updated to name the .sh scripts.
# Each generated file is written to $@.tmp and renamed only if the script
# succeeds. The recipes execute the scripts as commands, so they presumably
# need the executable bit set in the source tree - TODO confirm.
--- a/src/lib-fts/Makefile.am Mon May 11 22:38:38 2015 +0300
+++ b/src/lib-fts/Makefile.am Tue May 12 12:20:56 2015 +0300
@@ -22,20 +22,20 @@
 	udhr_fra.txt \
 	PropList.txt \
 	WordBreakProperty.txt \
-	word-boundary-data.awk \
+	word-boundary-data.sh \
 	word-boundary-data.c \
-	word-break-data.awk \
+	word-break-data.sh \
 	word-break-data.c
 
 WordBreakProperty.txt:
 	test -f WordBreakProperty.txt || wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
-$(srcdir)/word-boundary-data.c: word-boundary-data.awk WordBreakProperty.txt
-	$(AWK) -f $(srcdir)/word-boundary-data.awk < WordBreakProperty.txt > $@.tmp && mv $@.tmp $@
+$(srcdir)/word-boundary-data.c: word-boundary-data.sh WordBreakProperty.txt
+	$(srcdir)/word-boundary-data.sh < WordBreakProperty.txt > $@.tmp && mv $@.tmp $@
 
 PropList.txt:
 	test -f PropList.txt || wget http://www.unicode.org/Public/UNIDATA/PropList.txt
-$(srcdir)/word-break-data.c: word-break-data.awk PropList.txt
-	$(AWK) -f $(srcdir)/word-break-data.awk < PropList.txt > $@.tmp && mv $@.tmp $@
+$(srcdir)/word-break-data.c: word-break-data.sh PropList.txt
+	$(srcdir)/word-break-data.sh < PropList.txt > $@.tmp && mv $@.tmp $@
 
 
 if BUILD_FTS_STEMMER
# word-boundary-data.awk: file removed by this changeset (new side is
# /dev/null). It read WordBreakProperty.txt on stdin and printed one
# "static const uint32_t <category>[]" array per category named in BEGIN:
#   mystrtonum(str)          - "0x"-prefixed hex string -> number, or the
#                              string "NOT-A-HEX-NUMBER" if str is not hex
#   add_hexrange(start, end) - space-separated list of every value in the
#                              inclusive range start..end
#   main rule                - appends code points to array[category] for
#                              input lines containing "; <category> #"
#   END                      - prints each array, eight values per line
--- a/src/lib-fts/word-boundary-data.awk Mon May 11 22:38:38 2015 +0300
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,103 +0,0 @@
-#!/usr/bin/awk -f
-
-#
-# converts strings to hex numbers (gawk's strtonum function)
-# adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
-#
-function mystrtonum(str) {
-	# adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
-	if (str ~ /^0[xX][[:xdigit:]]+$/) {
-		str = substr(str, 3) # lop off leading 0x
-		n = length(str)
-		ret = 0
-		for (i = 1; i <= n; i++) {
-			c = substr(str, i, 1)
-			c = tolower(c)
-			# index() returns 0 if c not in string,
-			# includes c == "0"
-			k = index("123456789abcdef", c)
-			ret = ret * 16 + k
-		}
-	} else {
-		ret = "NOT-A-HEX-NUMBER"
-	}
-	return ret
-}
-
-#
-# expand number ranges (from..to) to sequences of numbers (emulate seq function)
-#
-function add_hexrange (start, end) {
-	from = mystrtonum("0x"start)
-	to = mystrtonum("0x"end)
-	for ( i=from; i<=to; i++ )
-		temp[i] = i
-	result = temp[from]
-	for ( i=from+1; i<=to; i++ )
-		result = result " " temp[i]
-	return result
-}
-
-#
-# initialization stuff (define categories of intrest in input file)
-#
-BEGIN {
-	FS = " "
-	ncategories = split("CR LF Newline Extend Regional_Indicator Format Katakana Hebrew_Letter ALetter \
-		Single_Quote Double_Quote MidNumLet MidLetter MidNum Numeric ExtendNumLet", array_names)
-}
-
-#
-# evaluate every line in input read from <stdin>
-#
-{
-	# skip comments and empty lines
-	if ( $0 !~ /^#/ && NF != 0 ) {
-		# cycle over array_names and do the math
-		for (category in array_names) {
-			# identify categories of interest (attention: relies on leading '; ' and trailing ' #' anchors,
-			# might be suited regex preferable!)
-			if ( $0 ~ "; "array_names[category]" #" ) {
-				# distinguish beetween single numbers and number ranges (from..to)
-				if ( $1 ~ /\.\./ ) {
-					split($1, bounderies, "\.")
-					array[category] = array[category] " " add_hexrange(bounderies[1], bounderies[3])
-				} else {
-					array[category] = array[category] " " mystrtonum("0x"$1)
-				}
-			}
-		}
-	}
-}
-
-#
-# format output to <stdout>
-#
-END {
-	print "/* This file is automatically generated by word-boundary-data.awk from WordBreakProperty.txt */"
-	for (category=1; category<=ncategories; category++) {
-		n = split(array[category], integers)
-		print "static const uint32_t "array_names[category]"[]= {"
-		if (n == 1) {
-			# split puts '0' into integers if arraysize equals to 1, thus:
-			printf("\t0x%05X", array[category])
-		} else {
-			for ( i=1; i<=n; i++) {
-				if ( i == 1 ) {
-					printf("\t0x%05X, ", integers[i])
-				} else if ( (i-1)%8 == 0 ) {
-					if ( i != n ) {
-						printf("\n\t0x%05X, ", integers[i])
-					} else {
-						printf("\n\t0x%05X", integers[i])
-					}
-				} else if ( i != n ) {
-					printf("0x%05X, ", integers[i])
-				} else {
-					printf("0x%05X", integers[i])
-				}
-			}
-		}
-		print "\n};"
-	}
-}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/word-boundary-data.sh Tue May 12 12:20:56 2015 +0300
@@ -0,0 +1,120 @@
+#!/bin/bash
+#
+# Generates the contents of word-boundary-data.c.
+#
+# Reads WordBreakProperty.txt on stdin and writes one
+# "static const uint32_t <Category>[]" array per category in $array_names to
+# stdout. Data lines have the form "<hex>[..<hex>] ; <Category> # comment";
+# empty lines and lines starting with '#' are ignored.
+#
+# Parsing uses bash builtins only, so no external process (cut, seq, printf)
+# is spawned per input line.
+
+# Categories to emit, in output order. Names match category names in data file.
+array_names="CR LF Newline Extend Regional_Indicator Format Katakana"
+array_names="$array_names Hebrew_Letter ALetter Single_Quote Double_Quote"
+array_names="$array_names MidNumLet MidLetter MidNum Numeric ExtendNumLet"
+
+for name in $array_names; do
+	declare -a "$name"
+done
+
+# Minimum number of hex digits printed per code point.
+WIDTH=5
+
+# is_hex STRING
+# Succeeds if STRING is a non-empty sequence of hexadecimal digits.
+is_hex () {
+	case "$1" in
+	""|*[!0-9A-Fa-f]*) return 1 ;;
+	esac
+	return 0
+}
+
+# add_hexrange ARRAY FROM TO
+# Appends every code point of the inclusive hex range FROM..TO to ARRAY.
+# ARRAY is expanded inside eval, so callers must pass a trusted variable name.
+add_hexrange () {
+	local array_name="$1"
+	local first=$((16#$2)) last=$((16#$3))
+	local -a range
+	local cp
+
+	range=()
+	for ((cp = first; cp <= last; cp++)); do
+		range+=("$cp")
+	done
+	eval "$array_name+=(\"\${range[@]}\")"
+}
+
+# print_c_array ARRAY
+# Prints ARRAY as a C array definition, eight values per line.
+print_c_array () {
+	local array_name="$1"
+	local -a values
+	local count val
+	local i=0
+
+	eval "values=(\"\${$array_name[@]}\")"
+	count=${#values[@]}
+	if [ "$count" -eq 0 ]; then
+		echo "${0##*/}: warning: no code points found for $array_name" >&2
+	fi
+
+	printf "static const uint32_t %s[]= {\n\t" "$array_name"
+	for val in "${values[@]}"; do
+		i=$((i + 1))
+		printf "0x%0${WIDTH}X" "$val"
+		if [ "$i" -lt "$count" ]; then
+			# The separator keeps its trailing space even before a
+			# line wrap, so the output layout stays unchanged.
+			printf ", "
+			if [ $((i % 8)) -eq 0 ]; then
+				printf "\n\t"
+			fi
+		fi
+	done
+	printf "\n};\n"
+}
+
+# Read everything except comments. The extra length test also processes a
+# final line that lacks a trailing newline.
+while read -r -a line || [ "${#line[@]}" -gt 0 ]; do
+	value="${line[0]}"
+	category="${line[2]}"
+
+	[ -z "$value" ] && continue			#ignore empty lines
+	case "$value" in \#*) continue ;; esac		#ignore comments
+
+	# Skip categories that are not emitted. Matching whole words also
+	# guarantees that only known variable names reach eval.
+	case " $array_names " in
+	*" $category "*) ;;
+	*) continue ;;
+	esac
+
+	case "$value" in
+	*..*)
+		start="${value%%..*}"
+		end="${value##*..}"
+		;;
+	*)
+		start="$value"
+		end="$value"
+		;;
+	esac
+
+	if ! is_hex "$start" || ! is_hex "$end"; then
+		echo "${0##*/}: malformed code point '$value'" >&2
+		exit 1
+	fi
+	add_hexrange "$category" "$start" "$end"
+done
+
+# Name the script without its directory so the output does not depend on the
+# path it was invoked through.
+printf "/* This file is automatically generated by %s from WordBreakProperty.txt */\n" "${0##*/}"
+
+for name in $array_names; do
+	print_c_array "$name"
+done
# word-break-data.awk: file removed by this changeset (new side is
# /dev/null). It read PropList.txt on stdin and printed one
# "static const uint32_t <category>[]" array per category named in BEGIN:
#   mystrtonum(str)          - "0x"-prefixed hex string -> number, or the
#                              string "NOT-A-HEX-NUMBER" if str is not hex
#   add_hexrange(start, end) - space-separated list of every value in the
#                              inclusive range start..end
#   main rule                - appends code points to array[category] for
#                              input lines containing "; <category> #"
#   END                      - prints each array, eight values per line
--- a/src/lib-fts/word-break-data.awk Mon May 11 22:38:38 2015 +0300
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,102 +0,0 @@
-#!/usr/bin/awk -f
-
-#
-# converts strings to hex numbers (gawk's strtonum function)
-# adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
-#
-function mystrtonum(str) {
-	# adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
-	if (str ~ /^0[xX][[:xdigit:]]+$/) {
-		str = substr(str, 3) # lop off leading 0x
-		n = length(str)
-		ret = 0
-		for (i = 1; i <= n; i++) {
-			c = substr(str, i, 1)
-			c = tolower(c)
-			# index() returns 0 if c not in string,
-			# includes c == "0"
-			k = index("123456789abcdef", c)
-			ret = ret * 16 + k
-		}
-	} else {
-		ret = "NOT-A-HEX-NUMBER"
-	}
-	return ret
-}
-
-#
-# expand number ranges (from..to) to sequences of numbers (emulate seq function)
-#
-function add_hexrange (start, end) {
-	from = mystrtonum("0x"start)
-	to = mystrtonum("0x"end)
-	for ( i=from; i<=to; i++ )
-		temp[i] = i
-	result = temp[from]
-	for ( i=from+1; i<=to; i++ )
-		result = result " " temp[i]
-	return result
-}
-
-#
-# initialization stuff (define categories of intrest in input file)
-#
-BEGIN {
-	FS = " "
-	ncategories = split("White_Space Dash Terminal_Punctuation STerm Pattern_White_Space", array_names)
-}
-
-#
-# evaluate every line in input read from <stdin>
-#
-{
-	# skip comments and empty lines
-	if ( $0 !~ /^#/ && NF != 0 ) {
-		# cycle over array_names and do the math
-		for (category in array_names) {
-			# identify categories of interest (attention: relies on leading '; ' and trailing ' #' anchors,
-			# might be suited regex preferable!)
-			if ( $0 ~ "; "array_names[category]" #" ) {
-				# distinguish beetween single numbers and number ranges (from..to)
-				if ( $1 ~ /\.\./ ) {
-					split($1, bounderies, "\.")
-					array[category] = array[category] " " add_hexrange(bounderies[1], bounderies[3])
-				} else {
-					array[category] = array[category] " " mystrtonum("0x"$1)
-				}
-			}
-		}
-	}
-}
-
-#
-# format output to <stdout>
-#
-END {
-	print "/* This file is automatically generated by word-break-data.awk from PropList.txt */"
-	for (category=1; category<=ncategories; category++) {
-		n = split(array[category], integers)
-		print "static const uint32_t "array_names[category]"[]= {"
-		if (n == 1) {
-			# split puts '0' into integers if arraysize equals to 1, thus:
-			printf("\t0x%05X", array[category])
-		} else {
-			for ( i=1; i<=n; i++) {
-				if ( i == 1 ) {
-					printf("\t0x%05X, ", integers[i])
-				} else if ( (i-1)%8 == 0 ) {
-					if ( i != n ) {
-						printf("\n\t0x%05X, ", integers[i])
-					} else {
-						printf("\n\t0x%05X", integers[i])
-					}
-				} else if ( i != n ) {
-					printf("0x%05X, ", integers[i])
-				} else {
-					printf("0x%05X", integers[i])
-				}
-			}
-		}
-		print "\n};"
-	}
-}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/word-break-data.sh Tue May 12 12:20:56 2015 +0300
@@ -0,0 +1,119 @@
+#!/bin/bash
+#
+# Generates the contents of word-break-data.c.
+#
+# Reads PropList.txt on stdin and writes one
+# "static const uint32_t <Property>[]" array per property in $array_names to
+# stdout. Data lines have the form "<hex>[..<hex>] ; <Property> # comment";
+# empty lines and lines starting with '#' are ignored.
+#
+# Parsing uses bash builtins only, so no external process (cut, seq, printf)
+# is spawned per input line.
+
+#Array names match category names in data file.
+array_names="White_Space Dash Terminal_Punctuation STerm Pattern_White_Space"
+#TODO include Pattern_Syntax?
+
+for name in $array_names; do
+	declare -a "$name"
+done
+
+# Minimum number of hex digits printed per code point.
+WIDTH=5
+
+# is_hex STRING
+# Succeeds if STRING is a non-empty sequence of hexadecimal digits.
+is_hex () {
+	case "$1" in
+	""|*[!0-9A-Fa-f]*) return 1 ;;
+	esac
+	return 0
+}
+
+# add_hexrange ARRAY FROM TO
+# Appends every code point of the inclusive hex range FROM..TO to ARRAY.
+# ARRAY is expanded inside eval, so callers must pass a trusted variable name.
+add_hexrange () {
+	local array_name="$1"
+	local first=$((16#$2)) last=$((16#$3))
+	local -a range
+	local cp
+
+	range=()
+	for ((cp = first; cp <= last; cp++)); do
+		range+=("$cp")
+	done
+	eval "$array_name+=(\"\${range[@]}\")"
+}
+
+# print_c_array ARRAY
+# Prints ARRAY as a C array definition, eight values per line.
+print_c_array () {
+	local array_name="$1"
+	local -a values
+	local count val
+	local i=0
+
+	eval "values=(\"\${$array_name[@]}\")"
+	count=${#values[@]}
+	if [ "$count" -eq 0 ]; then
+		echo "${0##*/}: warning: no code points found for $array_name" >&2
+	fi
+
+	printf "static const uint32_t %s[]= {\n\t" "$array_name"
+	for val in "${values[@]}"; do
+		i=$((i + 1))
+		printf "0x%0${WIDTH}X" "$val"
+		if [ "$i" -lt "$count" ]; then
+			# The separator keeps its trailing space even before a
+			# line wrap, so the output layout stays unchanged.
+			printf ", "
+			if [ $((i % 8)) -eq 0 ]; then
+				printf "\n\t"
+			fi
+		fi
+	done
+	printf "\n};\n"
+}
+
+# Read everything except comments. The extra length test also processes a
+# final line that lacks a trailing newline.
+while read -r -a line || [ "${#line[@]}" -gt 0 ]; do
+	value="${line[0]}"
+	category="${line[2]}"
+
+	[ -z "$value" ] && continue			#ignore empty lines
+	case "$value" in \#*) continue ;; esac		#ignore comments
+
+	# Skip properties that are not emitted. Matching whole words (rather
+	# than any substring of the list) also guarantees that only known
+	# variable names reach eval.
+	case " $array_names " in
+	*" $category "*) ;;
+	*) continue ;;
+	esac
+
+	case "$value" in
+	*..*)
+		start="${value%%..*}"
+		end="${value##*..}"
+		;;
+	*)
+		start="$value"
+		end="$value"
+		;;
+	esac
+
+	if ! is_hex "$start" || ! is_hex "$end"; then
+		echo "${0##*/}: malformed code point '$value'" >&2
+		exit 1
+	fi
+	add_hexrange "$category" "$start" "$end"
+done
+
+# Name the script without its directory so the output does not depend on the
+# path it was invoked through.
+printf "/* This file is automatically generated by %s from PropList.txt */\n" "${0##*/}"
+
+for name in $array_names; do
+	print_c_array "$name"
+done