Mozilla/mozilla/js2/src/lexer.cpp

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 *
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is the JavaScript 2 Prototype.
 *
 * The Initial Developer of the Original Code is Netscape
 * Communications Corporation.  Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s):
 *
 * Alternatively, the contents of this file may be used under the
 * terms of the GNU Public License (the "GPL"), in which case the
 * provisions of the GPL are applicable instead of those above.
 * If you wish to allow use of your version of this file only
 * under the terms of the GPL and not to allow others to use your
 * version of this file under the NPL, indicate your decision by
 * deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL.  If you do not delete
 * the provisions above, a recipient may use your version of this
 * file under either the NPL or the GPL.
 */

#include "numerics.h"
#include "lexer.h"

namespace JavaScript
{

// Construct a Lexer that reads tokens from the given source text.
// Identifiers, keywords, and regular expressions found while lexing are
// interned in the supplied world.  source, sourceLocation, and
// initialLineNum are handed unchanged to the Reader.
    Lexer::Lexer(World &world, const String &source,
                 const String &sourceLocation, uint32 initialLineNum):
            world(world), reader(source, sourceLocation, initialLineNum)
    {
        // Not in the middle of a number-plus-unit pair.
        lexingUnit = false;

        // The token buffer starts out empty: nothing has been lexed ahead...
        nTokensFwd = 0;
#ifdef DEBUG
        // ...and (debug bookkeeping only) nothing is available to unget.
        nTokensBack = 0;
#endif
        // The first token will be stored in the first buffer slot.
        nextToken = tokens;
    }


// Consume the next token and return a reference to it.  The reference stays
// valid until the next call to this Lexer.  At end of file the returned
// Token has Kind end.  The caller may modify the Token's value (for example
// by taking over the auto_ptr's data), but a Token modified that way must
// not be passed back through unget.
//
// preferRegExp decides how a leading / is read: true prefers the start of a
// regular expression, false prefers division or /=.
    const Token &
    Lexer::get(bool preferRegExp)
    {
        // Make sure the token is lexed; it lives in the slot at nextToken.
        const Token &current = peek(preferRegExp);

        // Step past it, wrapping around the end of the circular buffer.
        ++nextToken;
        if (nextToken == tokens + tokenBufferSize)
            nextToken = tokens;

        // One fewer token lexed ahead; one more available to unget (debug).
        --nTokensFwd;
        DEBUG_ONLY(++nTokensBack);
        return current;
    }


// Conditionally consume the next token.  The token is peeked with the given
// preferRegExp setting; if its kind equals the requested kind it is consumed
// and a pointer to it is returned.  Otherwise nothing is consumed and the
// result is a null pointer.
    const Token *
    Lexer::eat(bool preferRegExp, Token::Kind kind)
    {
        // Wrong kind: leave the token in place for the next peek/get.
        if (peek(preferRegExp).kind != kind)
            return 0;

        // Right kind: get() returns the already-peeked token and advances
        // past it.
        return &get(preferRegExp);
    }


// Look at the next token without consuming it.
//
// preferRegExp decides how a leading / is read: true prefers the start of a
// regular expression, false prefers division or /=.  A following peek or get
// returns this same token and must be given the same preferRegExp value
// (checked by assertion in debug builds).
    const Token &
    Lexer::peek(bool preferRegExp)
    {
        if (!nTokensFwd) {
            // Nothing lexed ahead yet: lex one token into the current slot.
            lexToken(preferRegExp);
            nTokensFwd = 1;
#ifdef DEBUG
            // Remember which setting this token was lexed with.
            savedPreferRegExp[nextToken - tokens] = preferRegExp;

            // Once tokenLookahead tokens are available to unget, the oldest
            // of them falls out of the window: mark the slot tokenLookahead
            // positions back (with wraparound) invalid.
            if (nTokensBack == tokenLookahead) {
                nTokensBack = tokenLookahead-1;
                if (tokenGuard) {
                    Token *expired =
                        nextToken >= tokens+tokenLookahead ?
                        nextToken-tokenLookahead :
                        nextToken+tokenBufferSize-tokenLookahead;
                    expired->valid = false;
                }
            }
#endif
        } else {
            // Reuse the token that was already looked up; it must have been
            // lexed with the same preferRegExp setting.
            ASSERT(savedPreferRegExp[nextToken - tokens] == preferRegExp);
        }
        return *nextToken;
    }


#ifdef DEBUG
// Debug builds only: record a different preferRegExp setting for the token
// that has already been peeked, without lexing it again.  The token must be
// one for which the setting makes no difference.
//
// THIS IS A DANGEROUS FUNCTION!
// Use it only if you can prove that the already peeked token does not
// start with a slash.
    void
    Lexer::redesignate(bool preferRegExp)
    {
        // There must be a peeked token, and the setting must really change.
        ASSERT(nTokensFwd);
        ASSERT(savedPreferRegExp[nextToken - tokens] != preferRegExp);

        // Tokens that start with a slash depend on the setting and cannot be
        // redesignated.
        Token &peeked = *nextToken;
        ASSERT(!peeked.hasKind(Token::regExp) &&
               !peeked.hasKind(Token::divide) &&
               !peeked.hasKind(Token::divideEquals));

        savedPreferRegExp[nextToken - tokens] = preferRegExp;
    }
#endif

// Unread the last token.  This call may be called to unread at most
// tokenBufferSize tokens at a time (where a peek also counts as temporarily
// reading and unreading one token). When a token that has been unread is
// peeked or read again, the same value must be passed in preferRegExp as for
// the first time that token was read or peeked.
    void
    Lexer::unget()
    {
        ASSERT(nTokensBack--);
        nTokensFwd++;
        if (nextToken == tokens)
            nextToken = tokens + tokenBufferSize;
        --nextToken;
    }

// Report a syntax error at the backUp-th last character read by the Reader.
// In other words, if backUp is 0, the error is at the next character to be
// read by the Reader; if backUp is 1, the error is at the last character
// read by the Reader, and so forth.
    void
    Lexer::syntaxError(const char *message, uint backUp)
    {
        reader.unget(backUp);
        reader.error(Exception::syntaxError, widenCString(message),
                     reader.getPos());
    }

// Read the next character from the reader, skipping over any Unicode
// format-control (Cf) characters.
    inline char16
    Lexer::getChar()
    {
        const char16 first = reader.get();

        // Fast path: characters below firstFormatChar are returned without
        // the format-character test.
        if (char16Value(first) < firstFormatChar)
            return first;
        return internalGetChar(first);
    }

// Slow path of getChar(): starting with the already-read character ch, keep
// reading while the current character is a format character, and return the
// first one that is not.
    char16
    Lexer::internalGetChar(char16 ch)
    {
        for (; isFormat(ch); ch = reader.get())
            ;
        return ch;
    }


// Look at the next character from the reader without consuming it.  Any
// Unicode format-control (Cf) characters in front of it are read and
// discarded.
    inline char16
    Lexer::peekChar()
    {
        const char16 first = reader.peek();

        // Fast path: characters below firstFormatChar are returned without
        // the format-character test.
        if (char16Value(first) < firstFormatChar)
            return first;
        return internalPeekChar(first);
    }

// Slow path of peekChar(): ch is the character currently being peeked.
// While it is a format character, consume it and peek at the one after it.
// Return the first peeked character that is not a format character; that
// character is left unread.
    char16
    Lexer::internalPeekChar(char16 ch)
    {
        for (;;) {
            if (!isFormat(ch))
                return ch;
            reader.get();           // discard the format character
            ch = reader.peek();
        }
    }

// Conditionally consume one character.  The next character is peeked
// (format-control characters in front of it are read and discarded); if it
// equals ch it is read and the result is true, otherwise it is left unread
// and the result is false.  ch must not be null.
    bool
    Lexer::testChar(char16 ch)
    {
        ASSERT(ch);     // A null ch could match the null returned at eof.

        if (peekChar() != ch)
            return false;
        reader.get();
        return true;
    }

// Lex the remainder of an escape sequence whose backslash has already been
// read, and return the character it stands for.  An invalid escape is
// reported through syntaxError.  When unicodeOnly is true, only the \uxxxx
// form is accepted.
    char16
    Lexer::lexEscape(bool unicodeOnly)
    {
        char16 ch = getChar();
        int hexDigits = 0;      // Nonzero when a \x or \u escape follows

        if (!unicodeOnly || ch == 'u') {
            switch (ch) {
                case '0':
                    // \0 is the null character, but only when the next
                    // character is not a decimal digit.
                    ch = peekChar();
                    if (!isASCIIDecimalDigit(ch))
                        return 0x00;
                    // Read the digit so that the error message points to
                    // the character after it.
                    getChar();
                    break;

                case 'b':
                    return 0x08;
                case 't':
                    return 0x09;
                case 'n':
                    return 0x0A;
                case 'v':
                    return 0x0B;
                case 'f':
                    return 0x0C;
                case 'r':
                    return 0x0D;

                case 'x':
                    hexDigits = 2;
                    break;
                case 'u':
                    hexDigits = 4;
                    break;

                default:
                    // Any other character stands for itself unless it is
                    // the end of input, alphanumeric, or a line break.
                    if (!reader.getEof(ch)) {
                        CharInfo chi(ch);
                        if (!isAlphanumeric(chi) && !isLineBreak(chi))
                            return ch;
                    }
                    break;
            }

            if (hexDigits) {
                // Accumulate exactly hexDigits hexadecimal digits, most
                // significant first.  Stopping early leaves remaining
                // nonzero and falls through to the error below.
                uint32 code = 0;
                int remaining = hexDigits;
                for (; remaining; --remaining) {
                    ch = getChar();
                    uint digit;
                    if (!isASCIIHexDigit(ch, digit))
                        break;
                    code = (code << 4) | digit;
                }
                if (!remaining)
                    return static_cast<char16>(code);
            }
        }

        syntaxError("Bad escape code");
        return 0;
    }


// Lex an identifier and store its characters in s; whatever s held before is
// ignored and cleared.  The result is true if at least one escape code was
// encountered.  When allowLeadingDigit is true the first character may be
// any identifier-continuing character (such as a digit); otherwise it must
// be an identifier-leading character.
    bool
    Lexer::lexIdentifier(String &s, bool allowLeadingDigit)
    {
        bool sawEscape = false;

        reader.beginRecording(s);
        for (;;) {
            const char16 raw = getChar();
            const bool escaped = raw == '\\';

            // decoded is the character the identifier actually contains:
            // the value of the \uxxxx escape, or the raw character itself.
            char16 decoded = raw;
            if (escaped) {
                decoded = lexEscape(true);
                sawEscape = true;
            }
            CharInfo decodedInfo(decoded);

            const bool belongs = allowLeadingDigit ?
                isIdContinuing(decodedInfo) : isIdLeading(decodedInfo);
            if (belongs) {
                reader.recordChar(decoded);
                // Every character after the first only has to be an
                // identifier-continuing character.
                allowLeadingDigit = true;
                continue;
            }

            // The identifier ends here.  A plain character is pushed back
            // for the next token; an escape that does not denote an
            // identifier character is an error.
            if (escaped)
                syntaxError("Identifier escape expands into "
                            "non-identifier character");
            else
                reader.unget();
            break;
        }
        reader.endRecording();
        return sawEscape;
    }


// Read a numeric literal into nextToken->chars and nextToken->value.
// Return true if the numeric literal is followed by a unit, but don't read
// the unit yet.
    bool
    Lexer::lexNumeral()
    {
        int hasDecimalPoint = 0;
        String &s = nextToken->chars;
        uint digit;

        reader.beginRecording(s);
        char16 ch = getChar();
        if (ch == '0') {
            reader.recordChar('0');
            ch = getChar();
            if ((ch&~0x20) == 'X') {
                uint32 pos = reader.getPos();
                char16 ch2 = getChar();
                if (isASCIIHexDigit(ch2, digit)) {
                    reader.recordChar(ch);
                    do {
                        reader.recordChar(ch2);
                        ch2 = getChar();
                    } while (isASCIIHexDigit(ch2, digit));
                    ch = ch2;
                } else
                    reader.setPos(pos);
                goto done;
            } else if (isASCIIDecimalDigit(ch)) {
                syntaxError("Numeric constant syntax error");
            }
        }
        while (isASCIIDecimalDigit(ch) || ch == '.' && !hasDecimalPoint++) {
            reader.recordChar(ch);
            ch = getChar();
        }
        if ((ch&~0x20) == 'E') {
            uint32 pos = reader.getPos();
            char16 ch2 = getChar();
            char16 sign = 0;
            if (ch2 == '+' || ch2 == '-') {
                sign = ch2;
                ch2 = getChar();
            }
            if (isASCIIDecimalDigit(ch2)) {
                reader.recordChar(ch);
                if (sign)
                    reader.recordChar(sign);
                do {
                    reader.recordChar(ch2);
                    ch2 = getChar();
                } while (isASCIIDecimalDigit(ch2));
                ch = ch2;
            } else
                reader.setPos(pos);
        }

      done:
        // At this point the reader is just past the character ch, which
        // is the first non-formatting character that is not part of the
        // number.
        reader.endRecording();
        const char16 *sBegin = s.data();
        const char16 *sEnd = sBegin + s.size();
        const char16 *numEnd;
        nextToken->value = stringToDouble(sBegin, sEnd, numEnd);
        ASSERT(numEnd == sEnd);
        reader.unget();
        ASSERT(ch == reader.peek());
        return isIdContinuing(ch) || ch == '\\';
    }


// Lex the body of a string literal into s; whatever s held before is ignored
// and cleared.  The opening quote has already been read and is passed in as
// separator; lexing stops after the matching closing quote.  Format
// characters inside the literal are dropped, escapes are replaced by the
// characters they denote, and a line break or end of input before the
// closing quote is a syntax error.
    void
    Lexer::lexString(String &s, char16 separator)
    {
        reader.beginRecording(s);
        for (;;) {
            char16 ch = reader.get();
            if (ch == separator)
                break;

            CharInfo chi(ch);
            if (isFormat(chi))
                continue;       // Format characters are not recorded

            if (ch == '\\')
                ch = lexEscape(false);
            else if (reader.getEof(ch) || isLineBreak(chi))
                syntaxError("Unterminated string literal");
            reader.recordChar(ch);
        }
        reader.endRecording();
    }


// Read a regular expression literal.  Store the regular expression in
// nextToken->id and the flags in nextToken->chars.
// The opening slash has already been read.
    void Lexer::lexRegExp()
    {
        String s;
        char16 prevCh = 0;

        reader.beginRecording(s);
        while (true) {
            char16 ch = getChar();
            CharInfo chi(ch);
            if (reader.getEof(ch) || isLineBreak(chi))
                syntaxError("Unterminated regular expression literal");
            if (prevCh == '\\') {
                reader.recordChar(ch);
                // Ignore slashes and backslashes immediately after a backslash
                prevCh = 0;
            } else if (ch != '/') {
                reader.recordChar(ch);
                prevCh = ch;
            } else
                break;
        }
        reader.endRecording();
        nextToken->id = &world.identifiers[s];

        lexIdentifier(nextToken->chars, true);
    }

// Read a token from the Reader and store it at *nextToken.
// If the Reader reached the end of file, store a Token whose Kind is end.
//
// preferRegExp only matters for a token that starts with a slash that does
// not begin a comment: when true the slash starts a regular expression
// literal, when false it is lexed as / or /=.
//
// Whitespace, format characters, line breaks, and comments are skipped
// before the token; t.lineBreak is set to true if a line break was skipped
// (including one inside a /* */ comment or one ending a // comment).
//
// The function is a goto-driven state machine; the labels are:
//   next           - read another character and start over (after skipped
//                    input)
//   endOfInput     - produce the end token
//   readIdentifier - back up one character and lex an identifier
//   tryAssignment  - turn the operator in kind into its assignment form if
//                    an = follows
//   logical        - turn a bitwise operator into its logical form if the
//                    operator character is doubled, then try assignment
//   comparison     - turn < or > into <= or >= if an = follows
//   number         - lex a numeric literal
//   endOfLine      - note a line break and start over
    void Lexer::lexToken(bool preferRegExp)
    {
        Token &t = *nextToken;
        t.lineBreak = false;
        t.id = 0;
        // Don't really need to waste time clearing this string here
        //clear(t.chars);
        Token::Kind kind;

        if (lexingUnit) {
            // The previous token was a number that lexNumeral() reported as
            // being followed by a unit; lex that unit now as its own token.
            // NOTE(review): t.pos is not assigned on this path, so it keeps
            // whatever value this buffer slot held before -- confirm whether
            // that is intended.
            lexIdentifier(t.chars, false);
            ASSERT(t.chars.size());
            kind = Token::unit;   // unit
            lexingUnit = false;
        } else {
          next:
            char16 ch = reader.get();
            if (reader.getEof(ch)) {
              endOfInput:
                // The Reader is one past the character just read, hence -1.
                t.pos = reader.getPos() - 1;
                kind = Token::end;
            } else {
                char16 ch2;
                CharInfo chi(ch);

                switch (cGroup(chi)) {
                    case CharInfo::FormatGroup:
                    case CharInfo::WhiteGroup:
                        // Skipped between tokens.
                        goto next;

                    case CharInfo::IdGroup:
                        t.pos = reader.getPos() - 1;
                  readIdentifier:
                        {
                            // Push the first character back so that
                            // lexIdentifier sees the whole identifier.
                            reader.unget();
                            String s;
                            bool hasEscape = lexIdentifier(s, false);
                            t.id = &world.identifiers[s];
                            // An identifier written with an escape is always
                            // a plain identifier; otherwise the kind comes
                            // from the interned entry (presumably this is
                            // how keywords are recognized -- confirm against
                            // World).
                            kind = hasEscape ? Token::identifier :
                                t.id->tokenKind;
                        }
                        break;

                    case CharInfo::NonIdGroup:
                    case CharInfo::IdContinueGroup:
                        t.pos = reader.getPos() - 1;
                        switch (ch) {
                            case '(':
                                kind = Token::openParenthesis;  // (
                                break;
                            case ')':
                                kind = Token::closeParenthesis; // )
                                break;
                            case '[':
                                kind = Token::openBracket;      // [
                                break;
                            case ']':
                                kind = Token::closeBracket;     // ]
                                break;
                            case '{':
                                kind = Token::openBrace;        // {
                                break;
                            case '}':
                                kind = Token::closeBrace;       // }
                                break;
                            case ',':
                                kind = Token::comma;            // ,
                                break;
                            case ';':
                                kind = Token::semicolon;        // ;
                                break;
                            case '.':
                                kind = Token::dot;              // .
                                ch2 = getChar();
                                if (isASCIIDecimalDigit(ch2)) {
                                    // Rewind to the dot itself so that
                                    // lexNumeral records it as part of the
                                    // number.
                                    reader.setPos(t.pos);
                                    goto number;               // decimal point
                                } else if (ch2 == '.') {
                                    kind = Token::doubleDot;    // ..
                                    if (testChar('.'))
                                        kind = Token::tripleDot;// ...
                                } else
                                    reader.unget();
                                break;
                            case ':':
                                kind = Token::colon;            // :
                                if (testChar(':'))
                                    kind = Token::doubleColon;  // ::
                                break;
                            case '#':
                                kind = Token::pound;            // #
                                break;
                            case '@':
                                kind = Token::at;               // @
                                break;
                            case '?':
                                kind = Token::question;         // ?
                                break;

                            case '~':
                                kind = Token::complement;       // ~
                                break;
                            case '!':
                                kind = Token::logicalNot;       // !
                                if (testChar('=')) {
                                    kind = Token::notEqual;     // !=
                                    if (testChar('='))
                                        kind = Token::notIdentical; // !==
                                }
                                break;

                            case '*':
                                kind = Token::times;            // * *=
                          tryAssignment:
                                // The arithmetic on kind assumes that each
                                // operator's assignment form sits at the
                                // same offset from it in Token::Kind as
                                // timesEquals does from times.
                                if (testChar('='))
                                    kind = Token::Kind(kind +
                                                       Token::timesEquals -
                                                       Token::times);
                                break;

                            case '/':
                                kind = Token::divide;           // /
                                ch = getChar();
                                if (ch == '/') {                // // comment
                                    // Skip to the end of the line; a comment
                                    // that runs into the end of input
                                    // produces the end token.
                                    do {
                                        ch = reader.get();
                                        if (reader.getEof(ch))
                                            goto endOfInput;
                                    } while (!isLineBreak(ch));
                                    goto endOfLine;
                                } else if (ch == '*') {         // /*comment*/
                                    // ch2 trails ch by one character; the
                                    // comment ends when they read */.  ch
                                    // starts at 0 so that the * of the
                                    // opening /* cannot serve as the * of
                                    // the closing */.
                                    ch = 0;
                                    do {
                                        ch2 = ch;
                                        ch = getChar();
                                        if (isLineBreak(ch)) {
                                            reader.beginLine();
                                            t.lineBreak = true;
                                        } else if (reader.getEof(ch))
                                            syntaxError("Unterminated /* "
                                                        "comment");
                                    } while (ch != '/' || ch2 != '*');
                                    goto next;
                                } else {
                                    // Not a comment: push back the character
                                    // after the slash.
                                    reader.unget();
                                    if (preferRegExp) {  // Regular expression
                                        kind = Token::regExp;
                                        lexRegExp();
                                    } else
                                        goto tryAssignment;    // /=
                                }
                                break;

                            case '%':
                                kind = Token::modulo;          // %
                                goto tryAssignment;            // %=

                            case '+':
                                kind = Token::plus;            // +
                                if (testChar('+'))
                                    kind = Token::increment;   // ++
                                else
                                    goto tryAssignment;        // +=
                                break;

                            case '-':
                                kind = Token::minus;           // -
                                ch = getChar();
                                if (ch == '-')
                                    kind = Token::decrement;   // --
                                else if (ch == '>')
                                    kind = Token::arrow;       // ->
                                else {
                                    reader.unget();
                                    goto tryAssignment;        // -=
                                }
                                break;

                            case '&':
                                kind = Token::bitwiseAnd;      // & && &= &&=
                          logical:
                                // ch still holds the operator character, so
                                // this tests for a doubled operator.  The
                                // arithmetic assumes each logical operator
                                // sits at the same offset from its bitwise
                                // counterpart in Token::Kind as logicalAnd
                                // does from bitwiseAnd.
                                if (testChar(ch))
                                    kind = Token::Kind(kind -
                                                       Token::bitwiseAnd +
                                                       Token::logicalAnd);
                                goto tryAssignment;
                            case '^':
                                kind = Token::bitwiseXor;      // ^ ^^ ^= ^^=
                                goto logical;
                            case '|':
                                kind = Token::bitwiseOr;       // | || |= ||=
                                goto logical;

                            case '=':
                                kind = Token::assignment;      // =
                                if (testChar('=')) {
                                    kind = Token::equal;       // ==
                                    if (testChar('='))
                                        kind = Token::identical; // ===
                                }
                                break;

                            case '<':
                                kind = Token::lessThan;        // <
                                if (testChar('<')) {
                                    kind = Token::leftShift;   // <<
                                    goto tryAssignment;        // <<=
                                }
                          comparison:
                                // Shared by < and >; assumes
                                // greaterThanOrEqual sits at the same offset
                                // from greaterThan in Token::Kind as
                                // lessThanOrEqual does from lessThan.
                                if (testChar('='))             // <= >=
                                    kind = Token::Kind(kind +
                                                       Token::lessThanOrEqual -
                                                       Token::lessThan);
                                break;
                            case '>':
                                kind = Token::greaterThan;     // >
                                if (testChar('>')) {
                                    kind = Token::rightShift;  // >>
                                    if (testChar('>'))
                                        kind = Token::logicalRightShift; // >>>
                                    goto tryAssignment;        // >>= >>>=
                                }
                                goto comparison;

                            case '\\':
                                goto readIdentifier;  // An identifier that
                                                      // starts with an escape

                            case '\'':
                            case '"':
                                kind = Token::string; // 'string' "string"
                                lexString(t.chars, ch);
                                break;

                            case '0':
                            case '1':
                            case '2':
                            case '3':
                            case '4':
                            case '5':
                            case '6':
                            case '7':
                            case '8':
                            case '9':
                                reader.unget();       // Number
                          number:
                                kind = Token::number;
                                // If a unit follows the number, the next
                                // call to lexToken lexes it as a separate
                                // unit token.
                                lexingUnit = lexNumeral();
                                break;

                            default:
                                syntaxError("Bad character");
                        }
                        break;

                    case CharInfo::LineBreakGroup:
                  endOfLine:
                  // Tell the Reader that a new line starts here, remember
                  // that a line break preceded the token, and keep looking
                  // for the token itself.
                  reader.beginLine();
                  t.lineBreak = true;
                  goto next;
                }
            }
        }
        t.kind = kind;
#ifdef DEBUG
        // Mark the slot as holding a live token; peek() clears this flag on
        // slots that fall out of the lookahead window.
        t.valid = true;
#endif
    }

}