Commit 70122b30 authored by Nikolai Kosjar

C++: Support for UTF-8 in the lexer

This will save us the toLatin1() conversions in CppTools (which already
holds UTF-8 encoded QByteArrays) and thus avoid loss of information (see
QTCREATORBUG-7356). It also gives us support for non-latin1 identifiers.

API-wise the following functions are added to Token. In follow-up
patches these will become handy in combination with QStrings.
    utf16chars() - equivalent of bytes()
    utf16charsBegin() - equivalent of bytesBegin()
    utf16charsEnd() - equivalent of bytesEnd()

Next steps:
 * Adapt functions from TranslationUnit. They should work with utf16
   chars in order to calculate lines and columns correctly also for
   UTF-8 multi-byte code points.
 * Adapt the higher level clients:
    * Cpp{Tools,Editor} should expect UTF-8 encoded Literals.
    * Cpp{Tools,Editor}: When dealing with identifiers on the
      QString/QTextDocument layer, code points
      represented by two QChars need to be respected, too.
 * Ensure Macro::offsets() and Document::MacroUse::{begin,end}() report
   offsets usable in CppEditor/CppTools.

Addresses QTCREATORBUG-7356.

Change-Id: I0791b5236be8215d24fb8e38a1f7cb0d279454c0
Reviewed-by: Erik Verbruggen <erik.verbruggen@digia.com>
parent 4fefb1ca
......@@ -29,6 +29,13 @@
using namespace CPlusPlus;
/*!
\class Lexer
\brief The Lexer generates tokens from a UTF-8 encoded source text.
\sa Token
*/
Lexer::Lexer(TranslationUnit *unit)
: _translationUnit(unit),
_control(unit->control()),
......@@ -63,6 +70,7 @@ void Lexer::setSource(const char *firstChar, const char *lastChar)
_firstChar = firstChar;
_lastChar = lastChar;
_currentChar = _firstChar - 1;
_currentCharUtf16 = -1;
_tokenStart = _currentChar;
_yychar = '\n';
}
......@@ -109,6 +117,7 @@ void Lexer::scan(Token *tok)
tok->reset();
scan_helper(tok);
tok->f.bytes = _currentChar - _tokenStart;
tok->f.utf16chars = _currentCharUtf16 - _tokenStartUtf16;
}
void Lexer::scan_helper(Token *tok)
......@@ -143,6 +152,9 @@ void Lexer::scan_helper(Token *tok)
_tokenStart = _currentChar;
tok->byteOffset = _currentChar - _firstChar;
_tokenStartUtf16 = _currentCharUtf16;
tok->utf16charOffset = _currentCharUtf16;
if (_yychar) {
s._newlineExpected = false;
} else if (s._tokenKind) {
......@@ -621,8 +633,8 @@ void Lexer::scan_helper(Token *tok)
} else {
scanIdentifier(tok);
}
} else if (std::isalpha(ch) || ch == '_' || ch == '$') {
scanIdentifier(tok);
} else if (std::isalpha(ch) || ch == '_' || ch == '$' || isByteOfMultiByteCodePoint(ch)) {
scanIdentifier(tok, _currentChar - _tokenStart - 1);
} else if (std::isdigit(ch)) {
scanNumericLiteral(tok);
} else {
......@@ -776,8 +788,10 @@ void Lexer::scanNumericLiteral(Token *tok)
void Lexer::scanIdentifier(Token *tok, unsigned extraProcessedChars)
{
const char *yytext = _currentChar - 1 - extraProcessedChars;
while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$')
while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$'
|| isByteOfMultiByteCodePoint(_yychar)) {
yyinp();
}
int yylen = _currentChar - yytext;
if (f._scanKeywords)
tok->f.kind = classify(yytext, yylen, _languageFeatures);
......
......@@ -62,6 +62,7 @@ public:
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
private:
void pushLineStartOffset();
void scan_helper(Token *tok);
void setSource(const char *firstChar, const char *lastChar);
static int classify(const char *string, int length, LanguageFeatures features);
......@@ -77,15 +78,32 @@ private:
void scanBackslash(Kind type);
void scanCppComment(Kind type);
inline void yyinp()
// Reports whether the most significant bit of \a byte is set, i.e. whether
// the byte value lies outside the 7-bit range 0x00..0x7F.
static bool isByteOfMultiByteCodePoint(unsigned char byte)
{
    return byte >= 0x80;
}
void yyinp()
{
_yychar = *++_currentChar;
++_currentCharUtf16;
// Process multi-byte UTF-8 code point (non-latin1)
if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) {
unsigned trailingBytesCurrentCodePoint = 1;
for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1)
++trailingBytesCurrentCodePoint;
// Code points >= 0x00010000 are represented by two UTF16 code units
if (trailingBytesCurrentCodePoint >= 3)
++_currentCharUtf16;
_yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1);
// Process single-byte UTF-8 code point (latin1)
} else {
_yychar = *++_currentChar;
}
if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
pushLineStartOffset();
}
void pushLineStartOffset();
private:
struct Flags {
unsigned _scanCommentTokens: 1;
......@@ -105,6 +123,10 @@ private:
const char *_lastChar;
const char *_tokenStart;
unsigned char _yychar;
unsigned _currentCharUtf16;
unsigned _tokenStartUtf16;
union {
unsigned char _state;
State s;
......@@ -113,6 +135,7 @@ private:
unsigned _flags;
Flags f;
};
unsigned _currentLine;
LanguageFeatures _languageFeatures;
};
......
......@@ -85,6 +85,7 @@ void Token::reset()
{
flags = 0;
byteOffset = 0;
utf16charOffset = 0;
ptr = 0;
}
......
......@@ -285,7 +285,7 @@ enum Kind {
class CPLUSPLUS_EXPORT Token
{
public:
Token() : flags(0), byteOffset(0), ptr(0) {}
Token() : flags(0), byteOffset(0), utf16charOffset(0), ptr(0) {}
inline bool is(unsigned k) const { return f.kind == k; }
inline bool isNot(unsigned k) const { return f.kind != k; }
......@@ -298,13 +298,14 @@ public:
inline bool joined() const { return f.joined; }
inline bool expanded() const { return f.expanded; }
inline bool generated() const { return f.generated; }
inline unsigned bytes() const { return f.bytes; }
inline unsigned bytesBegin() const
{ return byteOffset; }
inline unsigned bytes() const { return f.bytes; }
inline unsigned bytesBegin() const { return byteOffset; }
inline unsigned bytesEnd() const { return byteOffset + f.bytes; }
inline unsigned bytesEnd() const
{ return byteOffset + f.bytes; }
inline unsigned utf16chars() const { return f.utf16chars; }
inline unsigned utf16charsBegin() const { return utf16charOffset; }
inline unsigned utf16charsEnd() const { return utf16charOffset + f.utf16chars; }
inline bool isLiteral() const
{ return f.kind >= T_FIRST_LITERAL && f.kind <= T_LAST_LITERAL; }
......@@ -354,15 +355,17 @@ public:
unsigned generated : 1;
// Unused...
unsigned pad : 3;
// The token length in bytes.
// The token length in bytes and UTF16 chars.
unsigned bytes : 16;
unsigned utf16chars : 16;
};
union {
unsigned flags;
unsigned long flags;
Flags f;
};
unsigned byteOffset;
unsigned utf16charOffset;
union {
void *ptr;
......@@ -393,5 +396,4 @@ struct LanguageFeatures
} // namespace CPlusPlus
#endif // CPLUSPLUS_TOKEN_H
......@@ -61,11 +61,11 @@ bool SimpleLexer::endedJoined() const
return _endedJoined;
}
QList<Token> SimpleLexer::operator()(const QString &text, int state)
QList<Token> SimpleLexer::operator()(const QString &text, int state, bool convertToUtf8)
{
QList<Token> tokens;
const QByteArray bytes = text.toLatin1();
const QByteArray bytes = convertToUtf8 ? text.toUtf8() : text.toLatin1();
const char *firstChar = bytes.constData();
const char *lastChar = firstChar + bytes.size();
......
......@@ -54,7 +54,7 @@ public:
bool endedJoined() const;
QList<Token> operator()(const QString &text, int state = 0);
QList<Token> operator()(const QString &text, int state = 0, bool convertToUtf8 = false);
int state() const
{ return _lastState; }
......
......@@ -12,4 +12,5 @@ SUBDIRS = \
misc \
cxx11 \
checksymbols \
lexer
lexer \
translationunit
......@@ -13,6 +13,7 @@ Project {
"misc/misc.qbs",
"preprocessor/preprocessor.qbs",
"semantic/semantic.qbs",
"translationunit/translationunit.qbs",
"typeprettyprinter/typeprettyprinter.qbs"
]
}
This diff is collapsed.
include(../shared/shared.pri)
SOURCES += tst_translationunit.cpp
import qbs
import "../cplusplusautotest.qbs" as CPlusPlusAutotest
CPlusPlusAutotest {
name: "CPlusPlus translation unit autotest"
files: "tst_translationunit.cpp"
}
/****************************************************************************
**
** Copyright (C) 2014 Digia Plc and/or its subsidiary(-ies).
** Contact: http://www.qt-project.org/legal
**
** This file is part of Qt Creator.
**
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and Digia. For licensing terms and
** conditions see http://qt.digia.com/licensing. For further information
** use the contact form at http://qt.digia.com/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Digia gives you certain additional
** rights. These rights are described in the Digia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
****************************************************************************/
#include <cplusplus/PreprocessorClient.h>
#include <cplusplus/PreprocessorEnvironment.h>
#include <cplusplus/Token.h>
#include <cplusplus/TranslationUnit.h>
#include <cplusplus/pp-engine.h>
#include <QtTest>
#include <QDebug>
//TESTED_COMPONENT=src/libs/cplusplus
using namespace CPlusPlus;
/// Autotest checking that identifiers and string literals containing
/// non-latin1 code points are reported unchanged after the source has been
/// preprocessed and parsed.
class tst_TranslationUnit : public QObject
{
    Q_OBJECT

private slots:
    //
    // Code points outside latin1 that the slots below feed into the parser:
    //
    //   U+00FC  - 2 code units in UTF8, 1 in UTF16 - LATIN SMALL LETTER U WITH DIAERESIS
    //   U+4E8C  - 3 code units in UTF8, 1 in UTF16 - CJK UNIFIED IDEOGRAPH-4E8C
    //   U+10302 - 4 code units in UTF8, 2 in UTF16 - OLD ITALIC LETTER KE
    //
    void unicodeIdentifier();
    void unicodeIdentifier_data();
    void unicodeStringLiteral();
    void unicodeStringLiteral_data();

private:
    /// Bundles a Control, a TranslationUnit named "testFile" and a diagnostic
    /// client, and offers access to the literals the Control has collected.
    class Document
    {
        /// Diagnostic client that counts every report (whatever its level)
        /// and echoes it through qDebug().
        class Diagnostic : public DiagnosticClient
        {
        public:
            Diagnostic() : errorCount(0) {}

            void report(int /*level*/, const StringLiteral *fileName, unsigned line,
                        unsigned column, const char *format, va_list ap)
            {
                ++errorCount;
                qDebug() << fileName->chars() << ':' << line << ':' << column
                         << ' ' << QString().vsprintf(format, ap);
            }

            int errorCount; ///< Number of report() calls seen so far.
        };

    public:
        typedef QSharedPointer<Document> Ptr;

        /// Preprocesses and parses \a source with Objective-C enabled and all
        /// Qt specific language features disabled.
        ///
        /// Returns a null pointer if any diagnostic was reported, otherwise
        /// the parsed document.
        static Document::Ptr create(const QByteArray &source)
        {
            Document::Ptr doc(new Document);
            TranslationUnit *unit = doc->translationUnit();

            LanguageFeatures enabled;
            enabled.objCEnabled = true;
            enabled.qtEnabled = false;
            enabled.qtKeywordsEnabled = false;
            enabled.qtMocRunEnabled = false;
            unit->setLanguageFeatures(enabled);

            // NOTE(review): the buffer only lives until this function returns;
            // presumably the translation unit must not read its source text
            // after parse() - confirm against TranslationUnit::setSource().
            const QByteArray preprocessed = preprocess(source);
            unit->setSource(preprocessed.constData(), preprocessed.length());
            unit->parse();

            if (doc->hasParsingErrors())
                return Document::Ptr();
            return doc;
        }

        Document()
            : m_translationUnit(&m_control, m_control.stringLiteral("testFile"))
        {
            m_control.setDiagnosticClient(&m_diagnosticClient);
        }

        TranslationUnit *translationUnit()
        {
            return &m_translationUnit;
        }

        /// True as soon as the diagnostic client has received a report.
        bool hasParsingErrors() const
        {
            return m_diagnosticClient.errorCount != 0;
        }

        /// The element just before the Control's lastIdentifier() iterator.
        /// NOTE(review): assumes at least one identifier exists - confirm.
        const Identifier *lastIdentifier() const
        {
            return *(m_control.lastIdentifier() - 1);
        }

        /// The element just before the Control's lastStringLiteral() iterator.
        /// NOTE(review): assumes at least one string literal exists - confirm.
        const StringLiteral *lastStringLiteral() const
        {
            return *(m_control.lastStringLiteral() - 1);
        }

    private:
        /// Runs \a source through a Preprocessor that has no client attached
        /// and keeps comments; the input is reported under the name "<stdin>".
        static QByteArray preprocess(const QByteArray &source)
        {
            Environment environment;
            Client *noClient = 0; // no client.
            Preprocessor pp(noClient, &environment);
            pp.setKeepComments(true);
            return pp.run(QLatin1String("<stdin>"), source);
        }

    private:
        // Declaration order matters: m_translationUnit is initialised from
        // m_control in the constructor's initialiser list.
        Control m_control;
        TranslationUnit m_translationUnit;
        Diagnostic m_diagnosticClient;
    };
};
void tst_TranslationUnit::unicodeIdentifier()
{
QFETCH(QByteArray, identifierText);
Document::Ptr document = Document::create("void " + identifierText + ";");
QVERIFY(document);
const Identifier *actual = document->lastIdentifier();
QCOMPARE(QString::fromUtf8(actual->chars(), actual->size()),
QString::fromUtf8(identifierText));
}
/// Supplies the "identifierText" column consumed by unicodeIdentifier().
///
/// The non-latin1 code points are written as explicit UTF-8 byte sequences
/// instead of \u / \U universal character names. How a universal character
/// name in a narrow string literal is encoded depends on the compiler's
/// execution character set, so only explicit bytes guarantee that the lexer
/// is fed UTF-8 on every toolchain.
void tst_TranslationUnit::unicodeIdentifier_data()
{
    QTest::addColumn<QByteArray>("identifierText");

    typedef QByteArray _;

    // UTF-8 encodings used below (one adjacent string literal per code point):
    //   U+00FC  -> "\xc3\xbc"
    //   U+4E8C  -> "\xe4\xba\x8c"
    //   U+10302 -> "\xf0\x90\x8c\x82"
    QTest::newRow("latin1 identifier") << _("var");

    QTest::newRow("non-latin1 identifier 1")
        << _("prefix" "\xc3\xbc" "\xe4\xba\x8c" "\xf0\x90\x8c\x82");
    QTest::newRow("non-latin1 identifier 2")
        << _("prefix" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c");
    QTest::newRow("non-latin1 identifier 3")
        << _("\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c");
    QTest::newRow("non-latin1 identifier 4")
        << _("\xe4\xba\x8c" "\xf0\x90\x8c\x82" "\xc3\xbc");
    QTest::newRow("non-latin1 identifier 5")
        << _("\xe4\xba\x8c" "\xf0\x90\x8c\x82" "\xc3\xbc" "suffix");
    QTest::newRow("non-latin1 identifier 6")
        << _("\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c" "suffix");

    // Some special cases (different code path inside lexer)
    QTest::newRow("non-latin1 identifier 7")
        << _("LR" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c");
    QTest::newRow("non-latin1 identifier 8")
        << _("u8R" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c");
    QTest::newRow("non-latin1 identifier 9")
        << _("u8" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c");
    QTest::newRow("non-latin1 identifier 10")
        << _("u" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c");
}
/// Returns the text enclosed by the quotes of the string literal \a literal.
///
/// Everything up to and including the first double quote is dropped (this
/// also removes encoding prefixes such as L, u8, u or U), then all trailing
/// double quotes and whitespace are removed. An input without any double
/// quote yields an empty byte array, and so does an empty literal ("").
static QByteArray stripQuotesFromLiteral(const QByteArray &literal)
{
    // Strip front
    const int openingQuote = literal.indexOf('"');
    if (openingQuote == -1)
        return QByteArray();
    QByteArray result = literal.mid(openingQuote + 1);

    // Strip end
    while (!result.isEmpty()) {
        // std::isspace() is only defined for values representable as
        // unsigned char (or EOF); bytes of UTF-8 multi-byte sequences are
        // negative where char is signed, so convert before classifying.
        const unsigned char last = static_cast<unsigned char>(result.at(result.size() - 1));
        if (last != '"' && !std::isspace(last))
            break;
        result.chop(1);
    }

    return result;
}
void tst_TranslationUnit::unicodeStringLiteral()
{
QFETCH(QByteArray, literalText);
Document::Ptr document = Document::create("char t[] = " + literalText + ";");
QVERIFY(document);
const StringLiteral *actual = document->lastStringLiteral();
QCOMPARE(QString::fromUtf8(actual->chars(), actual->size()),
QString::fromUtf8(stripQuotesFromLiteral(literalText)));
}
/// Supplies the "literalText" column consumed by unicodeStringLiteral().
///
/// The non-latin1 code points are written as explicit UTF-8 byte sequences
/// instead of \u / \U universal character names. How a universal character
/// name in a narrow string literal is encoded depends on the compiler's
/// execution character set, so only explicit bytes guarantee that the lexer
/// is fed UTF-8 on every toolchain.
void tst_TranslationUnit::unicodeStringLiteral_data()
{
    QTest::addColumn<QByteArray>("literalText");

    typedef QByteArray _;

    // UTF-8 encodings used below (one adjacent string literal per code point):
    //   U+00FC  -> "\xc3\xbc"
    //   U+4E8C  -> "\xe4\xba\x8c"
    //   U+10302 -> "\xf0\x90\x8c\x82"
    QTest::newRow("latin1 literal") << _("\"var\"");

    QTest::newRow("non-latin1 literal 1")
        << _("\"" "prefix" "\xc3\xbc" "\xe4\xba\x8c" "\xf0\x90\x8c\x82" "\"");
    QTest::newRow("non-latin1 literal 2")
        << _("\"" "prefix" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c" "\"");
    QTest::newRow("non-latin1 literal 3")
        << _("\"" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c" "\"");
    QTest::newRow("non-latin1 literal 4")
        << _("\"" "\xe4\xba\x8c" "\xf0\x90\x8c\x82" "\xc3\xbc" "\"");
    QTest::newRow("non-latin1 literal 5")
        << _("\"" "\xe4\xba\x8c" "\xf0\x90\x8c\x82" "\xc3\xbc" "suffix" "\"");
    QTest::newRow("non-latin1 literal 6")
        << _("\"" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c" "suffix" "\"");

    // Literals carrying an encoding prefix
    QTest::newRow("non-latin1 literal 7")
        << _("L\"" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c" "\"");
    QTest::newRow("non-latin1 literal 8")
        << _("u8\"" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c" "\"");
    QTest::newRow("non-latin1 literal 9")
        << _("u\"" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c" "\"");
    QTest::newRow("non-latin1 literal 10")
        << _("U\"" "\xf0\x90\x8c\x82" "\xc3\xbc" "\xe4\xba\x8c" "\"");
}
QTEST_APPLESS_MAIN(tst_TranslationUnit)
#include "tst_translationunit.moc"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment