我正在把我的 ECMAScript 引擎所使用的 Quex 从 0.64.8 升级到 0.67.5。
词法分析器已经可以编译并运行,但现在它似乎只能识别 ANSI token,
而不能像以前那样识别 UTF-8 token。
本质上,我的做法是在运行 Quex 时提供 --codec utf8 标志,
并使用以下模式来识别标识符:
PATTERN_IDSTART [^0-9+\-<>*()\[\]?=&|~\\/\^%!{}\n\t\r"':;,. ]
PATTERN_IDPART {PATTERN_IDSTART}|{PATTERN_DIGIT}
PATTERN_ID {PATTERN_IDSTART}{PATTERN_IDPART}*
我的思路是:不去逐一列出所有允许出现的字符,
而是定义哪些字符不可接受,并把它们排除掉。新的词法分析器
可以很好地识别“test1”或“safari”这样的标识符,但似乎
无法正确处理“日本語”和“Örjan”。我只使用 UTF-8,
不使用 ICU 或 Iconv。
感觉我在这里误解了什么。
如能帮助解决这个问题,我将不胜感激。
编辑:
补充一下,我运行 Quex 时使用了以下参数,这一信息可能会有帮助:
-i ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx
--analyzer-class ecmascript_lexer
--foreign-token-id-file ${BISON_ECMASCRIPT_PARSER_OUTPUT_HEADER}
--token-id-prefix TOK_
--template-compression
--codec utf8 //--encoding utf8 since Quex 0.67.5
--buffer-element-size 1
--buffer-element-type char
--odir ${CMAKE_CURRENT_BINARY_DIR}/generated
--language c++
--warning-on-outrun
编辑2:
我没能构造出一个能重现问题的小示例,因为在小示例中
UTF-8 解析是正常的。因此,我把我的 ECMAScript 引擎中的
词法分析器部分抽取成了一个独立版本,
希望这样能更容易发现问题所在。
我现在已经不确定我的问题是否真的与
解析 UTF-8 token 有关。问题很可能
出在我的 .qx 文件中……无论如何,下面是
我的 ECMAScript 词法分析器的独立版本。
CMakeLists.txt
# Standalone build of the ECMAScript lexer; only a C++ compiler is needed.
cmake_minimum_required(VERSION 2.8)
project(ecmascript CXX)

# Suppress MSVC's deprecation warnings for the classic C runtime functions.
if(MSVC)
  add_definitions("-D_CRT_SECURE_NO_WARNINGS")
endif()

# Libraries and executables both end up in <build>/bin.
foreach(artifact_kind LIBRARY RUNTIME)
  set(CMAKE_${artifact_kind}_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/bin")
endforeach()
# Locate the Quex lexer generator.
#
# Quex is looked up exclusively through the QUEX_PATH environment variable
# (NO_DEFAULT_PATH), so a stray system-wide installation is never picked up.
#
# The REQUIRED keyword of find_program()/find_path() only exists since
# CMake 3.18; with the minimum version declared by this project an older CMake
# would treat "REQUIRED" as one more name to search for and silently continue
# with QUEX-NOTFOUND. The result is therefore checked explicitly.

# Platform specific launcher first, the plain "quex" name as fallback.
if(CMAKE_HOST_WIN32)
  set(QUEX_NAMES "quex.bat" "quex")
else()
  set(QUEX_NAMES "quex-exe.py" "quex")
endif()

find_program(QUEX
  NAMES ${QUEX_NAMES}
  HINTS ENV QUEX_PATH
  DOC "Path to Quex's executable."
  NO_DEFAULT_PATH)
if(NOT QUEX)
  message(FATAL_ERROR
    "Quex executable not found (tried: ${QUEX_NAMES}). "
    "Set the QUEX_PATH environment variable to the Quex installation directory.")
endif()

# quex/core.py marks the root of the Quex installation; the generated lexer
# includes headers relative to that directory.
find_path(QUEX_INCLUDE_DIR
  NAMES quex/core.py
  HINTS ENV QUEX_PATH
  DOC "Path to Quex's include directory"
  NO_DEFAULT_PATH)
if(NOT QUEX_INCLUDE_DIR)
  message(FATAL_ERROR
    "Quex include directory not found (looked for quex/core.py). "
    "Set the QUEX_PATH environment variable to the Quex installation directory.")
endif()
# Directory that receives the files generated by Quex; created at configure time.
file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/generated")

# Compile-time switches for the generated lexer.
add_definitions(
  -DQUEX_OPTION_LINE_NUMBER_COUNTING
  -DQUEX_OPTION_ASSERTS_DISABLED
  -DQUEX_SETTING_BUFFER_SIZE=1024 # sets the buffer size of the lexer
)
# Base path (without extension) of every file Quex generates for the lexer.
set(ECMASCRIPT_LEXER ${CMAKE_CURRENT_BINARY_DIR}/generated/ecmascript_lexer)

# Build rule that runs Quex on ecmascript.qx.
#
# The token-id header passed via --foreign-token-id-file is an input of the
# generator as well, so it is listed in DEPENDS; otherwise a change of the
# token ids would not regenerate the lexer. MAIN_DEPENDENCY already makes the
# rule depend on ecmascript.qx, so it is not repeated in DEPENDS.
add_custom_command(
  OUTPUT
    ${ECMASCRIPT_LEXER}
    ${ECMASCRIPT_LEXER}.cpp
    ${ECMASCRIPT_LEXER}-token
    ${ECMASCRIPT_LEXER}-configuration
    ${ECMASCRIPT_LEXER}-token_ids
  COMMAND ${QUEX}
    -i ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx
    --analyzer-class ecmascript_lexer # name of the lexer class
    --foreign-token-id-file ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript_yacc.hpp # token ids are generated by bison
    --token-id-prefix TOK_ # custom prefix for tokens (see ecmascript.y for details)
    --template-compression # optimise state transitions where possible
    --encoding utf8 # base the lexer on the UTF-8 encoding
    --buffer-element-size 1 # use a data type that is 1 byte wide
    --buffer-element-type uint8_t
    --odir ${CMAKE_CURRENT_BINARY_DIR}/generated # where all generated files are placed
    --language c++
  MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript.qx # this controls the order in which Quex and Bison are run
  DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/ecmascript_yacc.hpp
  COMMENT "Generating ecmascript lexer..."
  VERBATIM)
# Header search path: the Quex installation, this source directory and the
# directory holding the generated lexer.
include_directories(
  ${QUEX_INCLUDE_DIR}
  ${CMAKE_CURRENT_SOURCE_DIR}
  ${CMAKE_CURRENT_BINARY_DIR}/generated)

# Generated lexer files plus the hand-written test driver.
set(es_lexer)
list(APPEND es_lexer
  ${ECMASCRIPT_LEXER}
  ${ECMASCRIPT_LEXER}.cpp
  _main.cpp)

# The token-id header is listed as well so that it shows up with the target.
set(es_generated ${es_lexer})
list(APPEND es_generated ecmascript_yacc.hpp)

add_executable(es_lexer ${es_generated})
ecmascript.qx
header {
// Accumulator header (the string modes collect text with the accumulator)
// and the token ids declared in ecmascript_yacc.hpp.
#include <quex/code_base/extra/accumulator/Accumulator>
#include "ecmascript_yacc.hpp"
#include <cstdlib>
#include <cstdio>
// Character values appended to the accumulator by mode StringHelper when the
// corresponding escape sequence is matched inside a string literal.
#define BACKSPACE '\x08'
#define TAB '\x09'
#define NEWLINE '\x0A'
#define VERTICALTAB '\x0B'
#define FORMFEED '\x0C'
#define CARRIAGERETURN '\x0D'
#define DOUBLEQUOTE '\x22'
#define SINGLEQUOTE '\x27'
#define DOUBLEBACKSLASH '\x5C'
#define NULLTERM '\x00'
}
footer {
// Accumulator.i is the counterpart of the Accumulator header included in the
// header section above.
#include <quex/code_base/extra/accumulator/Accumulator.i>
}
// Named patterns referenced from the modes below via {NAME}.
define {
PATTERN_NEWLINE [\n\r]
// Decimal literals: integer part, optional fraction and optional exponent.
PATTERN_DIGIT [0-9]
PATTERN_NOZDIGIT [1-9]
PATTERN_DECINTLIT "0"|{PATTERN_NOZDIGIT}{PATTERN_DIGIT}*
PATTERN_EXPIND "e"|"E"
PATTERN_SIGNEDINT {PATTERN_DIGIT}+|"+"{PATTERN_DIGIT}+|"-"{PATTERN_DIGIT}+
PATTERN_EXPPART {PATTERN_EXPIND}{PATTERN_SIGNEDINT}
PATTERN_DECNUMBER {PATTERN_DECINTLIT}"."{PATTERN_DIGIT}*{PATTERN_EXPPART}?|"."{PATTERN_DIGIT}+{PATTERN_EXPPART}?|{PATTERN_DECINTLIT}{PATTERN_EXPPART}?
// Hexadecimal literals with a "0x" or "0X" prefix.
PATTERN_HEXDIGIT [0-9a-fA-F]
PATTERN_HEXNUMBER "0x"{PATTERN_HEXDIGIT}+|"0X"{PATTERN_HEXDIGIT}+
// NOTE(review): PATTERN_UNIESCSEQ and PATTERN_STRING are not referenced by any
// mode visible in this file.
PATTERN_UNIESCSEQ \\u{PATTERN_HEXDIGIT}{4}
PATTERN_STRING "\""(\\"\""|[^"])*"\""
// Delimiters that switch the lexer into and out of the string modes.
PATTERN_DOUBLE_QUOTE_STRING_DELIMITER "\""
PATTERN_SINGLE_QUOTE_STRING_DELIMITER "'"
PATTERN_SINGLELINE_COMMENT "//"[^\n\r]*
// Identifiers are defined by exclusion: an identifier may start with any
// character that is NOT a digit, an operator/punctuation character, a quote
// or whitespace; digits are additionally allowed after the first character.
PATTERN_IDSTART [^0-9+\-<>*()\[\]?=&|~\\/\^%!{}\n\t\r"':;,. ]
PATTERN_IDPART {PATTERN_IDSTART}|{PATTERN_DIGIT}
PATTERN_ID {PATTERN_IDSTART}{PATTERN_IDPART}*
}
// Base mode (inheritable only) that supplies the end-of-stream handling to
// every mode deriving from it.
mode EOF : <inheritable: only> {
on_end_of_stream {
// A line terminator is sent first, then the termination token.
self_send(TOK_LINETERM);
self_send(TOK_TERMINATION);
}
}
// Mode entered by mode Keywords right after "return", "throw", "break" and
// "continue". A newline directly following the keyword (spaces and tabs are
// skipped) sends ';' and returns to Program; any other input is handed back
// to Program.
mode RestrictedProduction : EOF
<skip: [ \t]>
{
{PATTERN_NEWLINE}{
self_send(';');
self << Program;
}
on_failure {
// NOTE(review): presumably undo() puts the unmatched input back so that
// Program lexes it again - confirm against the Quex documentation.
self.undo();
self << Program;
}
}
// Shared behaviour of the two string-literal modes (inheritable only).
// Entering the mode sends TOK_QUOTE; while inside, escape sequences are
// translated and all other input is collected in the accumulator; leaving the
// mode flushes the accumulator.
mode StringHelper : EOF
<inheritable: only>
{
on_entry {
    self_send(TOK_QUOTE);
}
on_exit {
    // TOK_STRLITPART is only sent when text has been accumulated; the
    // accumulator is then flushed with token id TOK_QUOTE.
    if(self.accumulator.text.begin != self.accumulator.text.end)
        self_send(TOK_STRLITPART);
    self_accumulator_flush(TOK_QUOTE);
}
{PATTERN_NEWLINE} => '\n';
"\\b" { self_accumulator_add_character(BACKSPACE); }
"\\t" { self_accumulator_add_character(TAB); }
"\\n" { self_accumulator_add_character(NEWLINE); }
"\\v" { self_accumulator_add_character(VERTICALTAB); }
"\\f" { self_accumulator_add_character(FORMFEED); }
"\\r" { self_accumulator_add_character(CARRIAGERETURN); }
"\\\"" { self_accumulator_add_character(DOUBLEQUOTE); }
"\\'" { self_accumulator_add_character(SINGLEQUOTE); }
"\\\\" { self_accumulator_add_character(DOUBLEBACKSLASH); }
"\\0" { self_accumulator_add_character(NULLTERM); }
"\\x"{PATTERN_HEXDIGIT}{2}
{
    // "\xHH" denotes the code point U+00HH. The two hex digits start at
    // Lexeme+2, right behind the "\x" prefix.
    //
    // The previous implementation appended the first two raw bytes of the
    // 'unsigned long' result, which depends on the host byte order and, on
    // little-endian machines, added a stray NUL after the character. The
    // lexer is generated for UTF-8 with 1-byte buffer elements, so the code
    // point is appended in its UTF-8 encoding instead: one byte below 0x80,
    // two bytes for 0x80-0xFF.
    const unsigned long ulCodePoint = strtoul(reinterpret_cast<char*>(Lexeme+2),0,16);
    if(ulCodePoint < 0x80) {
        self_accumulator_add_character(static_cast<uint8_t>(ulCodePoint));
    } else {
        self_accumulator_add_character(static_cast<uint8_t>(0xC0 | (ulCodePoint >> 6)));
        self_accumulator_add_character(static_cast<uint8_t>(0x80 | (ulCodePoint & 0x3F)));
    }
}
on_failure {
    // Anything that is not an escape sequence or a delimiter is ordinary
    // string content and is copied verbatim.
    self_accumulator_add(Lexeme, LexemeEnd);
}
}
// String literal delimited by single quotes; everything except the closing
// delimiter is inherited from StringHelper.
mode SingleQuoteString : StringHelper
{
{PATTERN_SINGLE_QUOTE_STRING_DELIMITER}
{
// When the end of the string is found, switch back to Program mode
self << Program;
}
}
// String literal delimited by double quotes; everything except the closing
// delimiter is inherited from StringHelper.
mode DoubleQuoteString : StringHelper
{
{PATTERN_DOUBLE_QUOTE_STRING_DELIMITER}
{
// When the end of the string is found, switch back to Program mode
self << Program;
}
}
// Mode entered from mode Operators when "++" or "--" has been matched. It
// re-reads the input around the operator: a newline sends ';' (outside of
// parentheses only), and the operator itself sends its token and returns to
// Program.
mode PrefixHelper : EOF
<skip: [ \t]> // Ignore whitespace
{
on_entry {
// NOTE(review): presumably 3 = the two operator characters plus the one
// character in front of them, so that a preceding newline can be seen.
// This looks like it assumes a single-element character before the
// operator and at least one character of input before it - confirm.
self.seek_backward(3);
}
{PATTERN_NEWLINE}
{
if(self.iParaCount == 0)
self_send(';');
}
"++"
{
self_send(TOK_PLUSPLUS);
self << Program;
}
"--"
{
self_send(TOK_MINUSMINUS);
self << Program;
}
on_failure {
// Unmatched input sends no token; the cast only marks Lexeme as used.
(void)Lexeme;
}
}
// Operator and punctuation tokens (inheritable only). Multi-character
// operators send dedicated TOK_* ids, single characters send their own
// character value as token id. "++" and "--" send nothing here and are
// delegated to PrefixHelper. '(' and ')' additionally maintain iParaCount,
// which the newline handlers consult; '}' is preceded by a TOK_LINETERM.
mode Operators : <inheritable: only>
{
"||" => TOK_OR;
"&&" => TOK_AND;
"++" { self << PrefixHelper; }
"--" { self << PrefixHelper; }
"===" => TOK_EQEQEQ;
"==" => TOK_EQEQ;
"!==" => TOK_NEQEQ;
"!=" => TOK_NEQ;
"*=" => TOK_MULTEQ;
"/=" => TOK_DIVEQ;
"%=" => TOK_MODEQ;
"+=" => TOK_PLUSEQ;
"\-=" => TOK_MINUSEQ;
">>>=" => TOK_GTGTGTEQ;
">>>" => TOK_GTGTGT;
"<<=" => TOK_LTLTEQ;
">>=" => TOK_GTGTEQ;
"<<" => TOK_LTLT;
">>" => TOK_GTGT;
"<=" => TOK_LTE;
">=" => TOK_GTE;
"&=" => TOK_AMPEQ;
"^=" => TOK_CIRCEQ;
"|=" => TOK_PIPEEQ;
['='] => '=';
['!'] => '!';
['('] { self_send('('); ++self.iParaCount; }
['+'] => '+';
['\-'] => '-';
['*'] => '*';
['/'] => '/';
['%'] => '%';
['<'] => '<';
['>'] => '>';
['\['] => '[';
['\]'] => ']';
['.'] => '.';
[','] => ',';
['?'] => '?';
[':'] => ':';
['~'] => '~';
['&'] => '&';
['^'] => '^';
['|'] => '|';
['{'] => '{';
[';'] => ';';
[')'] { self_send(')'); --self.iParaCount; }
['}'] { self_send(TOK_LINETERM); self_send('}'); }
}
// Reserved words (inheritable only). Each keyword sends its TOK_* id;
// "return", "throw", "break" and "continue" additionally switch to
// RestrictedProduction, where a directly following newline sends ';'.
mode Keywords : <inheritable: only>
{
function => TOK_FUNCTION;
return { self_send(TOK_RETURN); self << RestrictedProduction; }
var => TOK_VAR;
null => TOK_NULL;
true => TOK_TRUE;
false => TOK_FALSE;
instanceof => TOK_INSTANCEOF;
in => TOK_IN;
delete => TOK_DELETE;
void => TOK_VOID;
typeof => TOK_TYPEOF;
this => TOK_THIS;
if => TOK_IF;
else => TOK_ELSE;
with => TOK_WITH;
throw { self_send(TOK_THROW); self << RestrictedProduction; }
try => TOK_TRY;
catch => TOK_CATCH;
finally => TOK_FINALLY;
for => TOK_FOR;
break { self_send(TOK_BREAK); self << RestrictedProduction; }
continue { self_send(TOK_CONTINUE); self << RestrictedProduction; }
while => TOK_WHILE;
do => TOK_DO;
switch => TOK_SWITCH;
case => TOK_CASE;
default => TOK_DEFAULT;
new => TOK_NEW;
synchronized => TOK_SYNCHRONIZED;
}
// Literal values (inheritable only): numeric literals are sent together with
// their lexeme, a string delimiter switches to the matching string mode.
mode Values : <inheritable: only>
{
{PATTERN_DECNUMBER} => TOK_DECLIT(Lexeme);
{PATTERN_HEXNUMBER} => TOK_HEXINTLIT(Lexeme);
{PATTERN_DOUBLE_QUOTE_STRING_DELIMITER} { self << DoubleQuoteString; }
{PATTERN_SINGLE_QUOTE_STRING_DELIMITER} { self << SingleQuoteString; }
}
// Identifiers (inheritable only): sends TOK_ID together with the lexeme.
mode Identifiers : <inheritable: only>
{
{PATTERN_ID} => TOK_ID(Lexeme);
}
// Start mode of the lexer. It combines the keyword, identifier, value and
// operator modes with the end-of-stream handling of EOF, skips spaces, tabs
// and /* ... */ comments, and handles newlines and single-line comments.
// NOTE(review): presumably listing Keywords before Identifiers is what makes
// a keyword win over {PATTERN_ID} for a match of equal length - confirm
// against the Quex pattern priority rules.
mode Program : Keywords,
Identifiers,
Values,
Operators,
EOF
<skip: [ \t]>
<skip_range: "/*" "*/">
{
{PATTERN_NEWLINE}
{
// Newlines inside parentheses do not produce a line terminator.
if(self.iParaCount == 0)
self_send(TOK_LINETERM);
}
{PATTERN_SINGLELINE_COMMENT}
{}
}
body {
// Records uiToken as the current token id; the previous current id becomes
// uiLastToken.
void push_token(const unsigned int uiToken)
{
self.uiLastToken = self.uiCurrentToken;
self.uiCurrentToken = uiToken;
}
// True when the token id recorded before the current one was TOK_LINETERM.
bool use_auto_semi() const
{ return uiLastToken == TOK_LINETERM; }
// Token ids maintained by push_token().
unsigned int uiLastToken,
uiCurrentToken;
// Parenthesis counter: incremented on '(' and decremented on ')' by mode
// Operators, read by the newline handlers.
int iParaCount;
// Not written by the lexer itself; initialised to 0 in the constructor section.
quex::Token* pLastID;
// Collects the text of string literals (see mode StringHelper).
QUEX_NAME(Accumulator) accumulator;
}
constructor {
// Reset the members declared in the body section.
self.uiLastToken = 0;
self.uiCurrentToken = 0;
self.iParaCount = 0;
self.pLastID = 0;
// Construct the accumulator; a failure is reported by returning false.
if(!QUEX_NAME(Accumulator_construct)(&me->accumulator, me)) {
return false;
}
}
destructor {
// Counterpart of the Accumulator_construct call in the constructor section.
QUEX_NAME(Accumulator_destruct)(&me->accumulator);
}
// The lexer starts in mode Program.
start = Program;
ecmascript_yacc.hpp
/* Token ids shared between the lexer and the parser. The file is handed to
   Quex through --foreign-token-id-file; the TOK_ prefix matches the
   --token-id-prefix option. The include guard name indicates a header
   generated by bison. */
#ifndef YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED
# define YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED
/* Token type. */
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
enum yytokentype
{
/* Ids 0 and 1 are TOK_TERMINATION and TOK_UNINITIALIZED; the named tokens
   start at 258. Single-character tokens are sent by the lexer with their
   character value as id and therefore do not appear in this list. */
TOK_TERMINATION = 0,
TOK_UNINITIALIZED = 1,
TOK_ID = 258,
TOK_NULL = 259,
TOK_TRUE = 260,
TOK_FALSE = 261,
TOK_DECLIT = 262,
TOK_HEXINTLIT = 263,
TOK_OR = 264,
TOK_AND = 265,
TOK_PLUSPLUS = 266,
TOK_MINUSMINUS = 267,
TOK_EQEQ = 268,
TOK_NEQ = 269,
TOK_EQEQEQ = 270,
TOK_NEQEQ = 271,
TOK_LTE = 272,
TOK_GTE = 273,
TOK_INSTANCEOF = 274,
TOK_IN = 275,
TOK_STRLITPART = 276,
TOK_QUOTE = 277,
TOK_VOID = 278,
TOK_TYPEOF = 279,
TOK_DELETE = 280,
TOK_THIS = 281,
TOK_LTLT = 282,
TOK_GTGT = 283,
TOK_GTGTGT = 284,
TOK_MULTEQ = 285,
TOK_DIVEQ = 286,
TOK_MODEQ = 287,
TOK_PLUSEQ = 288,
TOK_MINUSEQ = 289,
TOK_LTLTEQ = 290,
TOK_GTGTEQ = 291,
TOK_GTGTGTEQ = 292,
TOK_AMPEQ = 293,
TOK_CIRCEQ = 294,
TOK_PIPEEQ = 295,
TOK_IF = 296,
TOK_ELSE = 297,
TOK_RETURN = 298,
TOK_VAR = 299,
TOK_WITH = 300,
TOK_THROW = 301,
TOK_TRY = 302,
TOK_CATCH = 303,
TOK_FINALLY = 304,
TOK_FOR = 305,
TOK_BREAK = 306,
TOK_CONTINUE = 307,
TOK_WHILE = 308,
TOK_DO = 309,
TOK_SWITCH = 310,
TOK_CASE = 311,
TOK_DEFAULT = 312,
TOK_NEW = 313,
TOK_FUNCTION = 314,
TOK_SYNCHRONIZED = 315,
TOK_LINETERM = 316
};
#endif
/* Value type. */
/* Falls back to a plain int unless YYSTYPE has already been defined or
   declared elsewhere. */
#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
typedef int YYSTYPE;
# define YYSTYPE_IS_TRIVIAL 1
# define YYSTYPE_IS_DECLARED 1
#endif
#endif /* !YY_ECMASCRIPT_YY_C_USERS_PATRIKJ_WORK_GIT_ECMASCRIPT_BUILD_VC14_X64_GENERATED_ECMASCRIPT_YACC_HPP_INCLUDED */
_main.cpp
#include <iostream>
#include "ecmascript_lexer"
/****************************************************************************************/
/// Writes the string representation of @p token to standard output, followed
/// by a newline and a flush.
void print_token(quex::Token* token)
{
    std::ostream& out = std::cout;
    out << token->get_string();
    out << std::endl;
}
/****************************************************************************************/
/// Lexes a JavaScript source file and prints every token it produces.
///
/// @param argc  number of command line arguments
/// @param argv  argv[1], when present, names the file to lex; without it the
///              historical default "id_test.js" is used
/// @return 0 on success, 1 when no lexer could be created for the file
int main(int argc, char** argv)
{
    const char* const fileName = (argc > 1) ? argv[1] : "id_test.js";

    quex::ecmascript_lexer *lexer = quex::ecmascript_lexer::from_file_name(fileName, 0);
    if(!lexer)
    {
        std::cerr << "Could not create a lexer for '" << fileName << "'" << std::endl;
        return 1;
    }

    quex::Token* token = 0;
    // The error state is re-checked before every receive(), including the one
    // that follows a line terminator.
    while(lexer->error_code == E_Error_None)
    {
        lexer->receive(&token);
        if(!token)
            break;
        print_token(token);
        lexer->push_token(token->type_id());
        // Line terminators need no further bookkeeping.
        if(token->type_id() == TOK_LINETERM)
            continue;
        // Remember the most recent identifier token.
        if(token->type_id() == TOK_ID)
            lexer->pLastID = token;
        if(token->type_id() == TOK_TERMINATION)
            break;
    }
    delete lexer;
    return 0;
}
id_test.js //用于测试词法分析器
test1 = safari;
myFunc()
function t(){}
if(test1 < 23)
return myFunc(45);
myFunc();
svenskaåäö();
var kalleö = 34;
var _us=kalleö;
_us = 678
日本語 = "Nihongo" // Japanska språkets namn
$myCar = _us
var new1 = kalleö ? t();
"kalleÖ, _us and $myCar should be ignored here"
الفصحى = "Arabiska"
/*
var new1 = kalleÖ ? t();
"kalleÖ, _us and $myCar should be ignored here"
*/
// var new1 = kalleÖ ? t();
대한민국 = 45;
대한민국X45 = "Value of: 대한민국" + 대한민국;
ärta="ärta + 2"
mix帝With대한민국 = "success?";
Örjan;
önes;
cake;
Россия;
РоссияX;
РоссияX
XРоссия;
XРоссия;
始皇帝 = "The First emperor"
始皇帝x2 = "The First emperor, twice?"
最好的祝福,
帕特里克·J
最佳答案
我建议您直接使用 Unicode 属性 ID_Start
和 ID_Continue,
这样您的 .qx 文件可以写成:
// Identifier defined through the Unicode properties ID_Start / ID_Continue.
// NOTE(review): the Unicode property ID_Start contains neither '$' nor '_',
// both of which start identifiers in the question's id_test.js ($myCar, _us);
// they would have to be added to ID_START explicitly.
define {
ID_START \P{ID_Start}
ID_CONTINUE \P{ID_Continue}
ID {ID_START}{ID_CONTINUE}*
}
这样 Quex 会查询 UCS 数据库,您就不必操心
具体的码点。
另外,如果您只想支持其中一个子集,可以用
intersection
截取所需的 UCS 范围,如下例所示:...
// Restrict both properties to the code point range \X900-\X970 by
// intersecting them with that range.
ID_START [: intersection([\X900-\X970], \P{ID_Start}) :]
ID_CONTINUE [: intersection([\X900-\X970], \P{ID_Continue}) :]
...
PS,您的解决方案并非完全错误。给定文件example.qx:
// Minimal lexer for the question's identifier pattern: the same exclusion
// based character set, with the digit class written inline.
define {
PATTERN_IDSTART [^0-9+\-<>*()\[\]?=&|~\\/\^%!{}\n\t\r"':;,. ]
PATTERN_IDPART {PATTERN_IDSTART}|[0-9]
PATTERN_ID {PATTERN_IDSTART}{PATTERN_IDPART}*
}
// Token ids declared here are used below with the QUEX_TKN_ prefix.
token { ID; WS; }
// Single mode: identifiers and whitespace, each sent with its lexeme.
mode X {
{PATTERN_ID} => QUEX_TKN_ID(Lexeme);
[ \n\t] => QUEX_TKN_WS(Lexeme);
}
以及一个用户文件“example.c”:
#include <stdio.h>
#include "EasyLexer.h"
/* Prints the string representation of the token, followed by " \n". */
void
print_token(quex_Token* token_p)
{
    /* Scratch space that get_string() renders the token into. */
    char text[1024];

    printf("%s \n", QUEX_NAME_TOKEN(get_string)(token_p, text, sizeof text));
}
/* Lexes "example.txt" and prints every token up to and including the
 * termination token. The command line arguments are not used. */
int
main(int argc, char** argv)
{
    quex_EasyLexer lexer;
    quex_Token*    current = NULL;

    quex_EasyLexer_from_file_name(&lexer, "example.txt", NULL);

    for(;;) {
        /* Stop as soon as the lexer reports an error. */
        if( lexer.error_code != E_Error_None ) break;

        quex_EasyLexer_receive(&lexer, &current);
        if( current == NULL ) break;

        print_token(current);
        if( current->_id == QUEX_TKN_TERMINATION ) break;
    }

    quex_EasyLexer_destruct(&lexer);
    return 0;
}
然后在命令行上执行:
> quex -i example.qx --encoding utf8 --language C -o EasyLexer
> gcc -I$QUEX_PATH example.c EasyLexer.c -o example
> ./example example.txt
输出如下:
ID 'Örjan'
WS '\n'
ID '日本語'
WS '\n'
假设文件“example.txt”是UTF-8编码的,并且包含
Örjan
日本語
除此之外我不知道还能补充什么。是我哪里理解错了吗?