changeset 945:fd7503352fd4

post fmt3: simplify UTF8 char handling. Instead of relying on the grammar to reassemble UTF-8 codepoints, do it in the lexer. Not only should this be more efficient, it will also allow us to do more advanced Unicode handling in libjeffpc. Signed-off-by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
author Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
date Mon, 26 Mar 2018 22:30:51 -0400
parents 5b0116b502d1
children b2025204a62d
files post_fmt3.l post_fmt3.y
diffstat 2 files changed, 4 insertions(+), 6 deletions(-) [+]
line wrap: on
line diff
--- a/post_fmt3.l	Mon Mar 26 22:31:44 2018 -0400
+++ b/post_fmt3.l	Mon Mar 26 22:30:51 2018 -0400
@@ -106,9 +106,8 @@
 '{1,2}			{ yylval->ptr = STR_DUP(yytext); return CQUOT; }
 [.,()/=!:;\+?@*#|]	{ yylval->ptr = STR_DUP(yytext); return CHAR; }
 ["<>]			{ yylval->ptr = STR_DUP(yytext); return SCHAR; }
-[\xe0-\xef]		{ yylval->ptr = STR_DUP(yytext); return UTF8FIRST3; }
-[\xc0-\xdf]		{ yylval->ptr = STR_DUP(yytext); return UTF8FIRST2; }
-[\x80-\xbf]		{ yylval->ptr = STR_DUP(yytext); return UTF8REST; }
+[\xe0-\xef][\x80-\xbf][\x80-\xbf]	{ yylval->ptr = STR_DUP(yytext); return UTF8CHAR; }
+[\xc0-\xdf][\x80-\xbf]			{ yylval->ptr = STR_DUP(yytext); return UTF8CHAR; }
 [A-Za-z0-9]+		{ yylval->ptr = STR_DUP(yytext); return WORD; }
 .			{ fmt3_error2("post text contains invalid characters", yytext); yyterminate(); }
 %%
--- a/post_fmt3.y	Mon Mar 26 22:31:44 2018 -0400
+++ b/post_fmt3.y	Mon Mar 26 22:30:51 2018 -0400
@@ -140,7 +140,7 @@
 /* generic tokens */
 %token <ptr> WSPACE
 %token <ptr> DASH OQUOT CQUOT SCHAR CHAR
-%token <ptr> UTF8FIRST3 UTF8FIRST2 UTF8REST WORD
+%token <ptr> UTF8CHAR WORD
 %token PERCENT ELLIPSIS
 %token PAREND
 
@@ -180,8 +180,7 @@
           ;
 
 thing : WORD				{ $$ = $1; }
-      | UTF8FIRST2 UTF8REST		{ $$ = str_cat(2, $1, $2); }
-      | UTF8FIRST3 UTF8REST UTF8REST	{ $$ = str_cat(3, $1, $2, $3); }
+      | UTF8CHAR			{ $$ = $1; }
       | '\n'				{ $$ = data->texttt_nesting ? STATIC_STR("\n") : STATIC_STR(" "); }
       | WSPACE				{ $$ = $1; }
       | DASH				{ $$ = dash($1); }