javascript - 读取代码点时出现偏移量问题

简历：我目前正在写一个ActionScript 3词法分析器，它将源代码转换成令牌。我选择通过代码点来解释输入，即一个字符串，该字符串具有包裹在类UString中的可选代理对。在后台，我使用UStringPos类缓存了最后的读取位置。

我已经测试了它如何用...扫描标识符"huehuehue"

'use strict';

import {Lexer}      from 'core/Lexer';
import {UString}    from 'utils/UString';
import ErrorHandler from 'core/ErrorHandler';

const errorHandler = new ErrorHandler(true);

// Tell the length to the `Lexer` manually.
const lexer = new Lexer(
  new UString('huehuehue'), 9, errorHandler);

// Scan first token
lexer.next();

const id = lexer.lookahead.value;

console.log(
    id,
    id.length
);

它应该已经记录了"huehuehue", 9，但这是另一个故事...

为什么缺少最后一个'e'？与扫描有关的最里面的方法是Lexer#getCommonIdentifier。顺便说一句，我已经测试过我的UString部分，它可以正常工作。

Lexer相关定义

/*
 * Class that turns AS3 code into tokens.
 */
export class Lexer
{
  /*
   * @param {UString} source
   * @param {Number} length
   * @param {ErrorHandler} errorHandler
   */
  constructor(source, length, errorHandler)
  {
    this.source = source;
    this.length = length;
    this.index = 0;
    this.lineStart = 0;
    this.lineNumber = 1;
    this.comments = [];

    this.errorHandler = errorHandler;

    this.previousToken = null;
    this.token         = null;
    this.lookahead     = null;

    this._special = [];
  }

  /*
   * Verifies the end of file.
   */
  eof()
  {
    return this.index >= this.length;
  }

  /*
   * Advance the previous, current and lookahead tokens.
   * The lexer however does not depend on these tokens.
   */
  next()
  {
    this.previousToken = this.token;
    this.token         = this.lookahead;
    this.lookahead     = this.lex();
  }

  /*
   * Consumes the next token and return it.
   */
  lex()
  {
    this.consumeWhiteSpaces();

    while (this.consumeComment())
      this.consumeWhiteSpaces();

    let cp = this.source.codePointAt(this.index);

    let pureIdentifier =
      Character.isIdentifierStart(cp);

    if (pureIdentifier || (cp === 0x5C))
      return this.scanIdentifierOrKeyword(!pureIdentifier);

    if (this.eof())
    {
      let loc = [ this.index, this.lineNumber ];
      return new Token(TokenType.EOF, loc, loc, '<end>');
    }
  }

  /*
   * Scan an identifier, keyword or boolean literal.
   */
  scanIdentifierOrKeyword(usingEscape)
  {
    const start = this.index;
    let id;

    /* Like Esprima does: only identifiers containing
     * escapes need some overheads. */
    if (usingEscape)
    {
      id = this.getEscapedIdentifier(
        String.fromCodePoint(this.scanUnicodeEscapeSequence()));
    }
    else
      id = this.getCommonIdentifier();

    return new Token(
      TokenType.IDENTIFIER,
      [ start     , this.lineNumber ],
      [ this.index, this.lineNumber ],
      id
    );
  }

  /*
   * Interprets an identifier. If any escape appears, switches to
   * getEscapedIdentifier().
   */
  getCommonIdentifier()
  {
    const start = this.source.position.offset;
    let cp = 0;

    // Jump the starting symbol.
    ++this.index;

    while (!this.eof())
    {
      cp = this.source.codePointAt(this.index);

      if (Character.isIdentifierPart(cp))
        ++this.index;

      // Switches to escape-minded task...
      else if (cp === 0x5C)
        return this.getUnicodeEscapedIdentifier(
          this.source.string.slice(
            start, this.source.position.offset
          )
        );

      else break;
    }
    return this.source.string.slice(
      start, this.source.position.offset
    );
  }

  /* ... */
}

utils / UString.js

'use strict';

/*
 * String wrapper with methods _based_ on code points.
 */
export class UString
{
  /*
   * Constructs the {UString}.
   *
   * @param {String} s String to be wrapped.
   */
  constructor(s)
  {
    /*
     * @type {String}
     */
    this.string = s;

    /*
     * Tracks the last accessed position.
     *
     * @type {UStringPos}
     */
    this.position = new UStringPos(0, 0);
  }

  /*
   * Reads a code point at specific index.
   *
   * @param {Number} index
   * @return {Number}
   */
  codePointAt(index)
  {
    this.position.walk(this.string, index);
    return this.string.codePointAt(this.position.offset);
  }

  /*
   * Slices the internal string by code point indices.
   *
   * @param {Number} i
   * @param {Number} j
   * @return {String}
   */
  slice(i, j)
  {
    this.position.walk(this.string, i);
    i = this.position.offset;

    this.position.walk(this.string, j);
    j = this.position.offset;

    return this.string.slice(i, j);
  }
};

/*
 * Class that tracks the position of a code point on a string.
 */
export class UStringPos
{
  /*
   * Constructs the {UStringPos}.
   *
   * @param {Number} index The initial index.
   * @param {Number} offset The initial offset.
   */
  constructor(index, offset)
  {
    /*
     * @type {Number}
     */
    this.index = index;

    /*
     * @type {Number}
     */
    this.offset = offset;
  }

  /*
   * Walks to the given index.
   *
   * @param {String} s
   * @param {Number} index
   * @note No backward. Track the previous position instead.
   * @return {void}
   */
  walk(s, index)
  {
    for (; this.index < index; ++this.index)
      this.offset += (
        this._usingSurrogates(
          s.charCodeAt(this.offset)
        ) ? 2 : 1
      );
  }

  /*
   * @private
   */
  _usingSurrogates(ch)
  {
    return (ch >= 0xD800) && (ch <= 0xDBFF);
  }
};

有什么事吗

最佳答案

好的。因此，这是this.source.position.offset的问题：当我执行++this.index时，我的UStringPos的偏移量不会更新。问题出在切片上。

    this.source.string.slice(
      start, this.source.position.offset
    );

该切片基于偏移量，因为我必须跟踪标识符开始的先前偏移量。

解

我可以使用自己的UString类的切片，并将第一个参数用作偏移量，最后一个参数用作普通索引。

'use strict';

export class UString
{
  // ...

  /*
   * Slices the internal string by using a pair of
   * offset and code point indices.
   *
   * @param {Number} i Offset
   * @param {Number} j
   * @return {String}
   */
  slice(i, j)
  {
    this.position.walk(this.string, j);
    j = this.position.offset;

    return this.string.slice(i, j);
  }

};