Commit 242b3f41, authored by Erik Verbruggen, committed by Nikolai Kosjar
Browse files

C++: clean up numeric literal parsing and add support for n3472.



Separate the messy pp-number parsing from the numeric literal parsing.
The C/C++ preprocessor makes a grown man cry, but at least we have
"proper" literal parsing when we want it, including C++1y binary
literals.

Next step is digit separators (n3781).

Change-Id: Ia069eef454ed5c056f77694a5b8a595d0b76adc4
Reviewed-by: Erik Verbruggen <erik.verbruggen@theqtcompany.com>
parent 16becbd2
......@@ -305,24 +305,27 @@ void Lexer::scan_helper(Token *tok)
tok->f.kind = T_ERROR;
}
} else if (std::isdigit(_yychar)) {
if (f._ppMode) {
scanPreprocessorNumber(tok, true);
break;
}
const char *yytext = _currentChar - 2;
do {
if (_yychar == 'e' || _yychar == 'E') {
yyinp();
if (_yychar == '-' || _yychar == '+') {
yyinp();
// ### CPP_CHECK(std::isdigit(_yychar));
}
} else if (std::isalnum(_yychar) || _yychar == '.') {
yyinp();
scanDigitSequence(); // this is optional: we already skipped over the first digit
scanExponentPart();
scanOptionalFloatingSuffix();
if (std::isalnum(_yychar) || _yychar == '_') {
do {
yyinp();
} else {
break;
}
} while (_yychar);
int yylen = _currentChar - yytext;
tok->f.kind = T_NUMERIC_LITERAL;
if (control())
tok->number = control()->numericLiteral(yytext, yylen);
} while (std::isalnum(_yychar) || _yychar == '_');
tok->f.kind = T_ERROR;
} else {
int yylen = _currentChar - yytext;
tok->f.kind = T_NUMERIC_LITERAL;
if (control())
tok->number = control()->numericLiteral(yytext, yylen);
}
} else {
tok->f.kind = T_DOT;
}
......@@ -651,7 +654,10 @@ void Lexer::scan_helper(Token *tok)
} else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) {
scanIdentifier(tok, _currentChar - _tokenStart - 1);
} else if (std::isdigit(ch)) {
scanNumericLiteral(tok);
if (f._ppMode)
scanPreprocessorNumber(tok, false);
else
scanNumericLiteral(tok);
} else {
tok->f.kind = T_ERROR;
}
......@@ -776,26 +782,141 @@ void Lexer::scanUntilQuote(Token *tok, unsigned char quote)
tok->string = control()->stringLiteral(yytext, yylen);
}
/// Consumes a run of decimal digits starting at the current character.
///
/// @return true if at least one digit was consumed, false if the current
///         character is not a digit (in which case nothing is consumed).
bool Lexer::scanDigitSequence()
{
    bool consumedAny = false;
    while (std::isdigit(_yychar)) {
        yyinp();
        consumedAny = true;
    }
    return consumedAny;
}
/// Consumes an exponent part: 'e' or 'E', an optional sign, then digits.
///
/// Nothing is consumed when the current character is not 'e'/'E'. Once the
/// exponent marker is seen, it and an optional sign are consumed even if no
/// digits follow.
///
/// @return true only if the marker was followed (after the optional sign) by
///         at least one digit.
bool Lexer::scanExponentPart()
{
    const bool atExponentMarker = (_yychar == 'e' || _yychar == 'E');
    if (!atExponentMarker)
        return false;

    yyinp(); // the 'e' / 'E'

    switch (_yychar) {
    case '+':
    case '-':
        yyinp(); // optional sign
        break;
    default:
        break;
    }

    return scanDigitSequence();
}
/// Consumes a single floating-point suffix character (f, F, l or L) if one is
/// present at the current position; otherwise does nothing.
void Lexer::scanOptionalFloatingSuffix()
{
    switch (_yychar) {
    case 'f':
    case 'F':
    case 'l':
    case 'L':
        yyinp();
        break;
    default:
        break;
    }
}
/// Consumes an optional integer suffix at the current position.
///
/// Accepted spellings are an unsigned marker (u/U) optionally followed by a
/// long marker (l, ll, L, LL), or a long marker optionally followed by an
/// unsigned marker. Both orders are valid in C++ ("1ul" and "1lu"). Mixed-case
/// long-long markers ("lL", "Ll") are not consumed as one suffix.
///
/// @param allowU whether an unsigned marker may still be consumed; false once
///               one has already been seen, so that "uu" is not accepted.
void Lexer::scanOptionalIntegerSuffix(bool allowU)
{
    switch (_yychar) {
    case 'u':
    case 'U':
        if (allowU) {
            yyinp();
            // A long marker may follow, but not a second unsigned marker.
            scanOptionalIntegerSuffix(false);
        }
        return;
    case 'l':
    case 'L': {
        const bool lowerCase = (_yychar == 'l');
        yyinp();
        // "ll" / "LL": the second character must have the same case.
        if (_yychar == (lowerCase ? 'l' : 'L'))
            yyinp();
        // The unsigned marker may also come after the long marker ("lu", "LLU").
        if (allowU && (_yychar == 'u' || _yychar == 'U'))
            yyinp();
        return;
    }
    default:
        return;
    }
}
/// Scans a numeric literal whose first digit has already been consumed.
///
/// Recognized forms: hexadecimal ("0x..."), binary ("0b...", see n3472),
/// octal ("0" followed by octal digits), decimal integers, and decimal
/// floating-point literals with optional fraction, exponent and suffix.
///
/// If the literal is directly followed by further alphanumeric characters or
/// underscores, those are consumed as well and the token kind is T_ERROR.
/// Otherwise the kind is T_NUMERIC_LITERAL and, when a control object is
/// available, the spelling is passed to control()->numericLiteral().
///
/// @param tok the token to fill in.
void Lexer::scanNumericLiteral(Token *tok)
{
    // The first digit was already consumed by the caller.
    const char *yytext = _currentChar - 1;
    if (*yytext == '0' && _yychar) {
        if (_yychar == 'x' || _yychar == 'X') {
            yyinp();
            while (std::isdigit(_yychar) ||
                   (_yychar >= 'a' && _yychar <= 'f') ||
                   (_yychar >= 'A' && _yychar <= 'F')) {
                yyinp();
            }
            scanOptionalIntegerSuffix();
            goto theEnd;
        } else if (_yychar == 'b' || _yychar == 'B') { // see n3472
            yyinp();
            while (_yychar == '0' || _yychar == '1')
                yyinp();
            scanOptionalIntegerSuffix();
            goto theEnd;
        } else if (_yychar >= '0' && _yychar <= '7') {
            do {
                yyinp();
            } while (_yychar >= '0' && _yychar <= '7');
            // A leading zero does not rule out a floating-point literal:
            // "01.5" and "01e5" are valid. Only commit to an octal integer
            // when no fraction or exponent follows; otherwise fall through to
            // the generic loop below, which handles '.', 'e' and 'E'.
            const bool continuesAsFloat =
                    _yychar == '.' || _yychar == 'e' || _yychar == 'E';
            if (!continuesAsFloat) {
                scanOptionalIntegerSuffix();
                goto theEnd;
            }
        }
    }

    while (_yychar) {
        if (_yychar == '.') {
            yyinp();
            scanDigitSequence(); // this is optional: "1." is a valid floating point number
            scanExponentPart();
            scanOptionalFloatingSuffix();
            break;
        } else if (_yychar == 'e' || _yychar == 'E') {
            // A floating suffix is only looked for after a complete exponent.
            if (scanExponentPart())
                scanOptionalFloatingSuffix();
            break;
        } else if (std::isdigit(_yychar)) {
            yyinp();
        } else {
            scanOptionalIntegerSuffix();
            break;
        }
    }

theEnd:
    if (std::isalnum(_yychar) || _yychar == '_') {
        // Trailing identifier-like characters: swallow them so the whole
        // malformed spelling becomes one error token.
        do {
            yyinp();
        } while (std::isalnum(_yychar) || _yychar == '_');
        tok->f.kind = T_ERROR;
    } else {
        int yylen = _currentChar - yytext;
        tok->f.kind = T_NUMERIC_LITERAL;
        if (control())
            tok->number = control()->numericLiteral(yytext, yylen);
    }
}
/// Scans a preprocessing number (pp-number) in preprocessor mode.
///
/// The grammar here is deliberately loose: after the start, any run of
/// alphanumeric characters, underscores and dots is accepted, and a sign is
/// accepted directly after 'e' or 'E'. The result is always a
/// T_NUMERIC_LITERAL, unless the token turns out to be a lone dot.
///
/// @param tok               the token to fill in.
/// @param dotAlreadySkipped true when the caller consumed a leading '.' (the
///                          token text then starts two characters back
///                          instead of one). If no digit follows that dot,
///                          the token is a T_DOT and nothing more is consumed.
void Lexer::scanPreprocessorNumber(Token *tok, bool dotAlreadySkipped)
{
    const char *yytext = _currentChar - (dotAlreadySkipped ? 2 : 1);

    // std::isdigit() is false for the terminating 0 as well, so this also
    // covers the end of input.
    if (dotAlreadySkipped && !std::isdigit(_yychar)) {
        tok->f.kind = T_DOT;
        return;
    }

    while (_yychar) {
        if (_yychar == 'e' || _yychar == 'E') {
            yyinp();
            // A sign is part of the pp-number only directly after an exponent
            // marker; whether a digit follows is intentionally not checked.
            if (_yychar == '+' || _yychar == '-')
                yyinp();
        } else if (std::isalnum(_yychar) || _yychar == '_' || _yychar == '.') {
            yyinp();
        } else {
            break;
        }
    }

    int yylen = _currentChar - yytext;
    tok->f.kind = T_NUMERIC_LITERAL;
    if (control())
        tok->number = control()->numericLiteral(yytext, yylen);
}
......
......@@ -61,6 +61,9 @@ public:
LanguageFeatures languageFeatures() const { return _languageFeatures; }
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
/// Turns preprocessor mode on or off by storing the value in the lexer's
/// _ppMode flag.
void setPreprocessorMode(bool onoff)
{
    f._ppMode = onoff;
}
public:
static void yyinp_utf8(const char *&currentSourceChar, unsigned char &yychar,
unsigned &utf16charCounter)
......@@ -95,7 +98,12 @@ private:
void scanRawStringLiteral(Token *tok, unsigned char hint = 0);
void scanCharLiteral(Token *tok, unsigned char hint = 0);
void scanUntilQuote(Token *tok, unsigned char quote);
bool scanDigitSequence();
bool scanExponentPart();
void scanOptionalFloatingSuffix();
void scanOptionalIntegerSuffix(bool allowU = true);
void scanNumericLiteral(Token *tok);
void scanPreprocessorNumber(Token *tok, bool dotAlreadySkipped);
void scanIdentifier(Token *tok, unsigned extraProcessedChars = 0);
void scanBackslash(Kind type);
void scanCppComment(Kind type);
......@@ -115,6 +123,7 @@ private:
unsigned _scanCommentTokens: 1;
unsigned _scanKeywords: 1;
unsigned _scanAngleStringLiteralTokens: 1;
unsigned _ppMode: 1;
};
struct State {
......
......@@ -41,7 +41,8 @@ using namespace CPlusPlus;
/// Constructs a SimpleLexer with last state 0 and with comment skipping, the
/// "ended joined" flag and preprocessor mode all switched off.
SimpleLexer::SimpleLexer()
    : _lastState(0),
      _skipComments(false),
      _endedJoined(false),
      _ppMode(false)
{}
SimpleLexer::~SimpleLexer()
......@@ -73,6 +74,7 @@ Tokens SimpleLexer::operator()(const QString &text, int state)
Lexer lex(firstChar, lastChar);
lex.setLanguageFeatures(_languageFeatures);
lex.setStartWithNewline(true);
lex.setPreprocessorMode(_ppMode);
if (! _skipComments)
lex.setScanCommentTokens(true);
......
......@@ -51,6 +51,9 @@ public:
bool skipComments() const;
void setSkipComments(bool skipComments);
/// Stores whether lexers created by this SimpleLexer should run in
/// preprocessor mode.
void setPreprocessorMode(bool ppMode)
{
    _ppMode = ppMode;
}
LanguageFeatures languageFeatures() const { return _languageFeatures; }
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
......@@ -74,6 +77,7 @@ private:
LanguageFeatures _languageFeatures;
bool _skipComments: 1;
bool _endedJoined: 1;
bool _ppMode: 1;
};
} // namespace CPlusPlus
......
......@@ -401,6 +401,9 @@ protected:
const char *end = spell + len;
char *vend = const_cast<char *>(end);
_value.set_long(strtol(spell, &vend, 0));
// TODO: if (vend != end) error(NaN)
// TODO: binary literals
// TODO: float literals
++(*_lex);
} else if (isTokenDefined()) {
++(*_lex);
......@@ -1388,6 +1391,7 @@ void Preprocessor::preprocess(const QString &fileName, const QByteArray &source,
m_state.m_lexer = new Lexer(source.constBegin(), source.constEnd());
m_state.m_lexer->setScanKeywords(false);
m_state.m_lexer->setScanAngleStringLiteralTokens(false);
m_state.m_lexer->setPreprocessorMode(true);
if (m_keepComments)
m_state.m_lexer->setScanCommentTokens(true);
m_state.m_result = result;
......@@ -1803,6 +1807,7 @@ const PPToken Preprocessor::evalExpression(PPToken *tk, Value &result)
PPToken lastConditionToken;
const QByteArray expanded = expand(tk, &lastConditionToken);
Lexer lexer(expanded.constData(), expanded.constData() + expanded.size());
lexer.setPreprocessorMode(true);
std::vector<Token> buf;
Token t;
do {
......
......@@ -70,6 +70,10 @@ private slots:
void basic_data();
void incremental();
void incremental_data();
void literals();
void literals_data();
void preprocessor();
void preprocessor_data();
void bytes_and_utf16chars();
void bytes_and_utf16chars_data();
......@@ -82,7 +86,8 @@ private:
void run(const QByteArray &source,
const Tokens &expectedTokens,
bool preserveState,
TokenCompareFlags compareFlags);
TokenCompareFlags compareFlags,
bool preprocessorMode = false);
int _state;
};
......@@ -103,11 +108,13 @@ Tokens tst_SimpleLexer::toTokens(const TokenKindList &tokenKinds)
void tst_SimpleLexer::run(const QByteArray &source,
const Tokens &expectedTokens,
bool preserveState,
TokenCompareFlags compareFlags)
TokenCompareFlags compareFlags,
bool preprocessorMode)
{
QVERIFY(compareFlags);
SimpleLexer lexer;
lexer.setPreprocessorMode(preprocessorMode);
const Tokens tokens = lexer(source, preserveState ? _state : 0);
if (preserveState)
_state = lexer.state();
......@@ -140,7 +147,10 @@ void tst_SimpleLexer::run(const QByteArray &source,
if (compareFlags & CompareUtf16CharsEnd)
QCOMPARE(token.utf16charsEnd(), expectedToken.utf16charsEnd());
}
QVERIFY2(i == expectedTokens.size(), "Less tokens than expected.");
QString msg = QLatin1String("Less tokens than expected: got %1, expected %2.");
msg = msg.arg(i).arg(expectedTokens.size());
QVERIFY2(i == expectedTokens.size(), msg.toUtf8().constData());
}
void tst_SimpleLexer::basic()
......@@ -254,6 +264,97 @@ void tst_SimpleLexer::basic_data()
QTest::newRow(source) << source << expectedTokenKindList;
}
void tst_SimpleLexer::literals()
{
QFETCH(QByteArray, source);
QFETCH(TokenKindList, expectedTokenKindList);
run(source, toTokens(expectedTokenKindList), false, CompareKind);
}
void tst_SimpleLexer::literals_data()
{
QTest::addColumn<QByteArray>("source");
QTest::addColumn<TokenKindList>("expectedTokenKindList");
QByteArray source;
TokenKindList expectedTokenKindList;
source =
"1.\n"
"1.1\n"
"1.23456789\n"
".1\n"
".3e8\n"
".3e8f\n"
"1e1\n"
"1E1\n"
"-1e-1\n" // the first minus sign is a separate token!
"1e-1\n"
"1e+1\n"
"1e1L\n"
"1e1l\n"
"1e1f\n"
"1e1F\n"
"23.45x"
".45x"
;
expectedTokenKindList =
TokenKindList() << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_MINUS
<< T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_ERROR << T_ERROR
;
QTest::newRow("float-literals") << source << expectedTokenKindList;
source = // these are all the same
"42\n"
"0b101010u\n"
"052ll\n"
"0x2aL\n"
"123FOO\n"
"0xfOo\n"
"33_\n"
;
expectedTokenKindList =
TokenKindList() << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_ERROR << T_ERROR << T_ERROR
;
QTest::newRow("integer-literals") << source << expectedTokenKindList;
}
void tst_SimpleLexer::preprocessor()
{
QFETCH(QByteArray, source);
QFETCH(TokenKindList, expectedTokenKindList);
run(source, toTokens(expectedTokenKindList), false, CompareKind, true);
}
void tst_SimpleLexer::preprocessor_data()
{
QTest::addColumn<QByteArray>("source");
QTest::addColumn<TokenKindList>("expectedTokenKindList");
QByteArray source;
TokenKindList expectedTokenKindList;
source = // sad but true [2.10]
"1\n"
"1x.\n"
"1.y\n"
".1_1.1.\n"
"1e-\n"
"01x1b2qWeRtty_Grumble+E-.\n"
;
expectedTokenKindList =
TokenKindList() << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL
<< T_NUMERIC_LITERAL << T_NUMERIC_LITERAL << T_NUMERIC_LITERAL;
QTest::newRow("pp-number") << source << expectedTokenKindList;
}
void tst_SimpleLexer::bytes_and_utf16chars()
{
QFETCH(QByteArray, source);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment