Mercurial > dovecot > core-2.2
changeset 18646:58d7234a6658
lib-fts: autogenerate C arrays using perl
The sh script had bashisms, the awk script crashed mawk, so let's try perl...
Signed-off-by: Phil Carmody <phil@dovecot.fi>
author | Phil Carmody <phil@dovecot.fi> |
---|---|
date | Tue, 12 May 2015 16:12:29 +0300 |
parents | 0cbb125046a5 |
children | d09d2ea2c31a |
files | src/lib-fts/Makefile.am src/lib-fts/word-boundary-data.sh src/lib-fts/word-break-data.sh src/lib-fts/word-properties.pl |
diffstat | 4 files changed, 39 insertions(+), 182 deletions(-) [+] |
line wrap: on
line diff
--- a/src/lib-fts/Makefile.am Tue May 12 12:45:34 2015 +0300 +++ b/src/lib-fts/Makefile.am Tue May 12 16:12:29 2015 +0300 @@ -21,21 +21,20 @@ EXTRA_DIST = \ udhr_fra.txt \ PropList.txt \ + word-properties.pl \ WordBreakProperty.txt \ - word-boundary-data.sh \ word-boundary-data.c \ - word-break-data.sh \ word-break-data.c WordBreakProperty.txt: test -f WordBreakProperty.txt || wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt -$(srcdir)/word-boundary-data.c: word-boundary-data.sh WordBreakProperty.txt - bash $(srcdir)/word-boundary-data.sh < WordBreakProperty.txt > $@.tmp && mv $@.tmp $@ +$(srcdir)/word-boundary-data.c: word-properties.pl PropList.txt + perl word-properties.pl boundaries WordBreakProperty.txt > $@.tmp && mv $@.tmp $@ PropList.txt: test -f PropList.txt || wget http://www.unicode.org/Public/UNIDATA/PropList.txt -$(srcdir)/word-break-data.c: word-break-data.sh PropList.txt - bash $(srcdir)/word-break-data.sh < PropList.txt > $@.tmp && mv $@.tmp $@ +$(srcdir)/word-break-data.c: word-properties.pl PropList.txt + perl word-properties.pl breaks PropList.txt > $@.tmp && mv $@.tmp $@ if BUILD_FTS_STEMMER
--- a/src/lib-fts/word-boundary-data.sh Tue May 12 12:45:34 2015 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ -#!/bin/bash -# TODO: Should perhaps be written in perl/python/awk -# FIXME: The runtime is a bit long. - -#Array names match category names in data file. -declare -a CR -declare -a LF -declare -a Newline -declare -a Extend -declare -a Regional_Indicator -declare -a Format -declare -a Katakana -declare -a Hebrew_Letter -declare -a ALetter -declare -a Single_Quote -declare -a Double_Quote -declare -a MidNumLet -declare -a MidLetter -declare -a MidNum -declare -a Numeric -declare -a ExtendNumLet - -WIDTH=5 - -add_hexrange () { - - array_name="$1" - from="$2" - to="$3" - - eval "$array_name+=($(seq -s ' ' "0x$from" "0x$to"))" -} - -print_c_array () { - - array_name="$1" - eval "array=("\${$array_name[@]}")" - array_length=${#array[@]} - i=1 - - printf "static const uint32_t %s[]= {\n\t" "$array_name" - - for val in "${array[@]}" ; do - printf "0x%0${WIDTH}X" "$val" - if [ $i -lt $array_length ]; then - echo -n ", " - if [ $(($i%8)) -eq 0 ]; then - echo -ne "\n\t" - fi - i=$((i+1)) - else - break - fi - done - - echo -ne "\n};\n" -} -#read everything except comments. -while read -s -a line; do - [ -z "${line[0]}" ] && continue #ignore empty lines - - case "${line[0]}" in \#*) continue ;; esac #ignore comments - - value="${line[0]}" - category="${line[2]}" - - case "$value" in - *..*) - start=`echo "$value" | cut -d . -f 1` - end=`echo "$value" | cut -d . 
-f 3` - add_hexrange "$category" "$start" "$end" - ;; - *) - value=`printf "%05X" $((16#$value))` - eval "$category+=(0x\$value)" - ;; - esac; - -done - -printf "/* This file is automatically generated by %s from WordBreakProperty.txt */\n" "$0" - -print_c_array CR -print_c_array LF -print_c_array Newline -print_c_array Extend -print_c_array Regional_Indicator -print_c_array Format -print_c_array Katakana -print_c_array Hebrew_Letter -print_c_array ALetter -print_c_array Single_Quote -print_c_array Double_Quote -print_c_array MidNumLet -print_c_array MidLetter -print_c_array MidNum -print_c_array Numeric -print_c_array ExtendNumLet -
--- a/src/lib-fts/word-break-data.sh Tue May 12 12:45:34 2015 +0300 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,77 +0,0 @@ -#!/bin/bash - -#Array names match category names in data file. -array_names="White_Space Dash Terminal_Punctuation STerm Pattern_White_Space" -declare -a White_Space -declare -a Dash -declare -a Terminal_Punctuation -declare -a STerm -declare -a Pattern_White_Space -#TODO include Pattern_Syntax? - -WIDTH=5 - -add_hexrange () { - - array_name="$1" - from="$2" - to="$3" - - eval "$array_name+=($(seq -s ' ' "0x$from" "0x$to"))" -} - -print_c_array () { - - array_name="$1" - eval "array=("\${$array_name[@]}")" - array_length=${#array[@]} - i=1 - - printf "static const uint32_t %s[]= {\n\t" "$array_name" - - for val in "${array[@]}" ; do - printf "0x%0${WIDTH}X" "$val" - if [ $i -lt $array_length ]; then - echo -n ", " - if [ $(($i%8)) -eq 0 ]; then - echo -ne "\n\t" - fi - i=$((i+1)) - else - break - fi - done - - echo -ne "\n};\n" -} -#read everything except comments. -while read -s -a line; do - [ -z "${line[0]}" ] && continue #ignore empty lines - - case "${line[0]}" in \#*) continue ;; esac #ignore comments - - value="${line[0]}" - category="${line[2]}" - - case "$array_names" in - *"$category"*) - case "$value" in - *..*) - start=`echo "$value" | cut -d . -f 1` - end=`echo "$value" | cut -d . -f 3` - add_hexrange "$category" "$start" "$end" - ;; - *) - value=`printf "%05X" $((16#$value))` - eval "$category+=(0x\$value)" - ;; - esac - ;; - esac -done -printf "/* This file is automatically generated by %s from PropList.txt */\n" "$0" - -for name in $array_names; do - print_c_array "$name" -done -
#!/usr/bin/env perl
# src/lib-fts/word-properties.pl
#
# Generates C arrays of Unicode code points, one array per property name,
# from a Unicode data file in the "XXXX[..YYYY] ; Property # comment" format.
#
# Usage: word-properties.pl boundaries WordBreakProperty.txt
#        word-properties.pl breaks PropList.txt
#
# The first argument selects the set of property names to extract.  The
# remaining arguments are read through <>, so the data may also be supplied
# on standard input.  The generated C source is written to standard output.
use strict;
use warnings;

# Property names to extract for each mode.  One C array is emitted per name,
# in the order listed here.
my %categories_for = (
    boundaries => [qw(CR LF Newline Extend Regional_Indicator Format Katakana
                      Hebrew_Letter ALetter Single_Quote Double_Quote MidNumLet
                      MidLetter MidNum Numeric ExtendNumLet)],
    breaks     => [qw(White_Space Dash Terminal_Punctuation STerm
                      Pattern_White_Space)],
);

# A missing mode is handled like an unknown one, so the caller gets the usage
# error rather than an "uninitialized value" warning followed by the error.
my $which = shift(@ARGV);
die "specify 'boundaries' or 'breaks'"
    unless defined($which) and exists($categories_for{$which});

my @categories = @{ $categories_for{$which} };
my $catregexp  = join('|', @categories);
my %catlists   = map { ($_ => []) } @categories;

while (my $line = <>) {
    # Skip comment lines and blank lines.
    next if $line =~ m/^#/ or $line =~ m/^\s*$/;

    # A data line holds a single hex code point or a "first..last" range,
    # followed by "; <property> #".  Lines for other properties do not match
    # and are ignored.  The trailing " #" keeps a name such as "Extend" from
    # matching as a prefix of "ExtendNumLet".
    my ($first, $last, $category) =
        $line =~ m/([[:xdigit:]]+)(?:\.\.([[:xdigit:]]+))?\s+; ($catregexp) #/
        or next;

    # Ranges are expanded so that every code point is listed individually.
    push @{ $catlists{$category} },
        defined($last) ? (hex($first) .. hex($last)) : hex($first);
}

# <> merely warns when an input file cannot be opened.  Without this check the
# script would exit successfully with nothing but empty arrays, and a caller
# that moves the output into place on success would install a useless file.
my $total = 0;
$total += scalar(@{ $catlists{$_} }) for @categories;
die "word-properties.pl: no '$which' code points found in input\n"
    if $total == 0;

# $ARGV still names the last file read by <> ("-" for standard input).
print "/* This file is automatically generated by word-properties.pl from $ARGV */\n";
for my $category (@categories) {
    # Work on a copy so that the collected lists are left intact.
    my @codepoints = @{ $catlists{$category} };

    print "static const uint32_t ${category}[]= {\n";

    # Eight values per row; every row but the last ends with ", ".
    my @rows;
    while (@codepoints) {
        push @rows,
            "\t" . join(", ", map { sprintf("0x%05X", $_) } splice(@codepoints, 0, 8));
    }
    print join(", \n", @rows), "\n" if @rows;

    print "};\n";
}