Commit c6358e5d authored by Nikolai Kosjar's avatar Nikolai Kosjar
Browse files

C++: Add utf16 indices to Macro and Document::MacroUse



In most cases we need to work with UTF-16 indices. Only cppfindreferences
still needs the byte interface, since it reads in files and works on a
QByteArray to save memory.

Change-Id: I6ef6a93fc1875a8c9a305c075d51a9ca034c41bb
Reviewed-by: default avatarErik Verbruggen <erik.verbruggen@digia.com>
parent bb7da966
...@@ -36,6 +36,21 @@ using namespace CPlusPlus; ...@@ -36,6 +36,21 @@ using namespace CPlusPlus;
\sa Token \sa Token
*/ */
/*!
\fn static void Lexer::yyinp_utf8(const char *&currentSourceChar, unsigned char &yychar, unsigned &utf16charCounter)
Process a single Unicode code point in a UTF-8 encoded source.
\a currentSourceChar points to the UTF-8 encoded source.
\a yychar must be the byte pointed to by \a currentSourceChar.
Points \a currentSourceChar to the byte of the next code point
and modifies \a yychar to the value pointed by the updated
\a currentSourceChar. \a utf16charCounter will be incremented by
the number of UTF-16 code units that were needed for that code
point.
*/
Lexer::Lexer(TranslationUnit *unit) Lexer::Lexer(TranslationUnit *unit)
: _translationUnit(unit), : _translationUnit(unit),
_control(unit->control()), _control(unit->control()),
......
...@@ -61,6 +61,28 @@ public: ...@@ -61,6 +61,28 @@ public:
LanguageFeatures languageFeatures() const { return _languageFeatures; } LanguageFeatures languageFeatures() const { return _languageFeatures; }
void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; } void setLanguageFeatures(LanguageFeatures features) { _languageFeatures = features; }
public:
// Consumes one code point from a UTF-8 encoded buffer.
//
// currentSourceChar  in/out: points at the first byte of the code point on
//                    entry and at the first byte of the next one on return.
// yychar             in/out: must hold the byte at currentSourceChar on entry;
//                    is reloaded from the updated currentSourceChar on return.
// utf16charCounter   in/out: incremented by the number of UTF-16 code units
//                    counted for the consumed code point (1 or 2).
//
// NOTE(review): the sequence length is derived from the lead byte alone; the
// trailing bytes are skipped without being inspected, so this presumably
// relies on the input being well-formed UTF-8 - confirm against callers.
static void yyinp_utf8(const char *&currentSourceChar, unsigned char &yychar,
                       unsigned &utf16charCounter)
{
    // Every code point accounts for at least one UTF-16 code unit.
    ++utf16charCounter;

    // A single-byte code point advances by exactly one byte.
    unsigned bytesToAdvance = 1;

    if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(yychar))) {
        // Multi-byte code point: a lead byte is followed by at least one
        // trailing byte. Shifting the two leading marker bits out and testing
        // the remaining bits one by one counts the additional trailing bytes.
        unsigned trailingBytes = 1;
        unsigned char remainingLeadBits = yychar << 2;
        while (isByteOfMultiByteCodePoint(remainingLeadBits)) {
            ++trailingBytes;
            remainingLeadBits <<= 1;
        }

        // Code points >= 0x00010000 are represented by two UTF-16 code units
        if (trailingBytes >= 3)
            ++utf16charCounter;

        bytesToAdvance = trailingBytes + 1;
    }

    currentSourceChar += bytesToAdvance;
    yychar = *currentSourceChar;
}
private: private:
void pushLineStartOffset(); void pushLineStartOffset();
void scan_helper(Token *tok); void scan_helper(Token *tok);
...@@ -83,23 +105,7 @@ private: ...@@ -83,23 +105,7 @@ private:
void yyinp() void yyinp()
{ {
++_currentCharUtf16; yyinp_utf8(_currentChar, _yychar, _currentCharUtf16);
// Process multi-byte UTF-8 code point (non-latin1)
if (CPLUSPLUS_UNLIKELY(isByteOfMultiByteCodePoint(_yychar))) {
unsigned trailingBytesCurrentCodePoint = 1;
for (unsigned char c = _yychar << 2; isByteOfMultiByteCodePoint(c); c <<= 1)
++trailingBytesCurrentCodePoint;
// Code points >= 0x00010000 are represented by two UTF16 code units
if (trailingBytesCurrentCodePoint >= 3)
++_currentCharUtf16;
_yychar = *(_currentChar += trailingBytesCurrentCodePoint + 1);
// Process single-byte UTF-8 code point (latin1)
} else {
_yychar = *++_currentChar;
}
if (CPLUSPLUS_UNLIKELY(_yychar == '\n')) if (CPLUSPLUS_UNLIKELY(_yychar == '\n'))
pushLineStartOffset(); pushLineStartOffset();
} }
......
...@@ -264,7 +264,7 @@ void TranslationUnit::tokenize() ...@@ -264,7 +264,7 @@ void TranslationUnit::tokenize()
currentExpanded = true; currentExpanded = true;
const std::pair<unsigned, unsigned> &p = lineColumn[lineColumnIdx]; const std::pair<unsigned, unsigned> &p = lineColumn[lineColumnIdx];
if (p.first) if (p.first)
_expandedLineColumn.insert(std::make_pair(tk.bytesBegin(), p)); _expandedLineColumn.insert(std::make_pair(tk.utf16charsBegin(), p));
else else
currentGenerated = true; currentGenerated = true;
......
...@@ -365,25 +365,31 @@ void Document::appendMacro(const Macro &macro) ...@@ -365,25 +365,31 @@ void Document::appendMacro(const Macro &macro)
_definedMacros.append(macro); _definedMacros.append(macro);
} }
void Document::addMacroUse(const Macro &macro, unsigned offset, unsigned length, void Document::addMacroUse(const Macro &macro,
unsigned bytesOffset, unsigned bytesLength,
unsigned utf16charsOffset, unsigned utf16charLength,
unsigned beginLine, unsigned beginLine,
const QVector<MacroArgumentReference> &actuals) const QVector<MacroArgumentReference> &actuals)
{ {
MacroUse use(macro, offset, offset + length, beginLine); MacroUse use(macro,
bytesOffset, bytesOffset + bytesLength,
utf16charsOffset, utf16charsOffset + utf16charLength,
beginLine);
foreach (const MacroArgumentReference &actual, actuals) { foreach (const MacroArgumentReference &actual, actuals) {
const Block arg(actual.position(), actual.position() + actual.length()); const Block arg(0, 0, actual.utf16charsOffset(),
actual.utf16charsOffset() + actual.utf16charsLength());
use.addArgument(arg); use.addArgument(arg);
} }
_macroUses.append(use); _macroUses.append(use);
} }
void Document::addUndefinedMacroUse(const QByteArray &name, unsigned offset) void Document::addUndefinedMacroUse(const QByteArray &name,
unsigned bytesOffset, unsigned utf16charsOffset)
{ {
QByteArray copy(name.data(), name.size()); QByteArray copy(name.data(), name.size());
UndefinedMacroUse use(copy, offset); UndefinedMacroUse use(copy, bytesOffset, utf16charsOffset);
_undefinedMacroUses.append(use); _undefinedMacroUses.append(use);
} }
...@@ -548,19 +554,23 @@ const Macro *Document::findMacroDefinitionAt(unsigned line) const ...@@ -548,19 +554,23 @@ const Macro *Document::findMacroDefinitionAt(unsigned line) const
return 0; return 0;
} }
const Document::MacroUse *Document::findMacroUseAt(unsigned offset) const const Document::MacroUse *Document::findMacroUseAt(unsigned utf16charsOffset) const
{ {
foreach (const Document::MacroUse &use, _macroUses) { foreach (const Document::MacroUse &use, _macroUses) {
if (use.contains(offset) && (offset < use.begin() + use.macro().name().length())) if (use.containsUtf16charOffset(utf16charsOffset)
&& (utf16charsOffset < use.utf16charsBegin() + use.macro().nameToQString().size())) {
return &use; return &use;
}
} }
return 0; return 0;
} }
const Document::UndefinedMacroUse *Document::findUndefinedMacroUseAt(unsigned offset) const const Document::UndefinedMacroUse *Document::findUndefinedMacroUseAt(unsigned utf16charsOffset) const
{ {
foreach (const Document::UndefinedMacroUse &use, _undefinedMacroUses) { foreach (const Document::UndefinedMacroUse &use, _undefinedMacroUses) {
if (use.contains(offset) && (offset < use.begin() + use.name().length())) if (use.containsUtf16charOffset(utf16charsOffset)
&& (utf16charsOffset < use.utf16charsBegin()
+ QString::fromUtf8(use.name(), use.name().size()).length()))
return &use; return &use;
} }
return 0; return 0;
...@@ -581,21 +591,21 @@ void Document::setUtf8Source(const QByteArray &source) ...@@ -581,21 +591,21 @@ void Document::setUtf8Source(const QByteArray &source)
_translationUnit->setSource(_source.constBegin(), _source.size()); _translationUnit->setSource(_source.constBegin(), _source.size());
} }
void Document::startSkippingBlocks(unsigned start) void Document::startSkippingBlocks(unsigned utf16charsOffset)
{ {
_skippedBlocks.append(Block(start, 0)); _skippedBlocks.append(Block(0, 0, utf16charsOffset, 0));
} }
void Document::stopSkippingBlocks(unsigned stop) void Document::stopSkippingBlocks(unsigned utf16charsOffset)
{ {
if (_skippedBlocks.isEmpty()) if (_skippedBlocks.isEmpty())
return; return;
unsigned start = _skippedBlocks.back().begin(); unsigned start = _skippedBlocks.back().utf16charsBegin();
if (start > stop) if (start > utf16charsOffset)
_skippedBlocks.removeLast(); // Ignore this block, it's invalid. _skippedBlocks.removeLast(); // Ignore this block, it's invalid.
else else
_skippedBlocks.back() = Block(start, stop); _skippedBlocks.back() = Block(0, 0, start, utf16charsOffset);
} }
bool Document::isTokenized() const bool Document::isTokenized() const
......
...@@ -77,10 +77,12 @@ public: ...@@ -77,10 +77,12 @@ public:
QString fileName() const; QString fileName() const;
void appendMacro(const Macro &macro); void appendMacro(const Macro &macro);
void addMacroUse(const Macro &macro, unsigned offset, unsigned length, void addMacroUse(const Macro &macro,
unsigned beginLine, unsigned bytesOffset, unsigned bytesLength,
const QVector<MacroArgumentReference> &range); unsigned utf16charsOffset, unsigned utf16charLength,
void addUndefinedMacroUse(const QByteArray &name, unsigned offset); unsigned beginLine, const QVector<MacroArgumentReference> &range);
void addUndefinedMacroUse(const QByteArray &name,
unsigned bytesOffset, unsigned utf16charsOffset);
Control *control() const; Control *control() const;
TranslationUnit *translationUnit() const; TranslationUnit *translationUnit() const;
...@@ -108,8 +110,8 @@ public: ...@@ -108,8 +110,8 @@ public:
void setFingerprint(const QByteArray &fingerprint) void setFingerprint(const QByteArray &fingerprint)
{ m_fingerprint = fingerprint; } { m_fingerprint = fingerprint; }
void startSkippingBlocks(unsigned offset); void startSkippingBlocks(unsigned utf16charsOffset);
void stopSkippingBlocks(unsigned offset); void stopSkippingBlocks(unsigned utf16charsOffset);
enum ParseMode { // ### keep in sync with CPlusPlus::TranslationUnit enum ParseMode { // ### keep in sync with CPlusPlus::TranslationUnit
ParseTranlationUnit, ParseTranlationUnit,
...@@ -207,22 +209,34 @@ public: ...@@ -207,22 +209,34 @@ public:
class Block class Block
{ {
unsigned _begin; unsigned _bytesBegin;
unsigned _end; unsigned _bytesEnd;
unsigned _utf16charsBegin;
unsigned _utf16charsEnd;
public: public:
inline Block(unsigned begin = 0, unsigned end = 0) inline Block(unsigned bytesBegin = 0, unsigned bytesEnd = 0,
: _begin(begin), _end(end) unsigned utf16charsBegin = 0, unsigned utf16charsEnd = 0)
{ } : _bytesBegin(bytesBegin),
_bytesEnd(bytesEnd),
_utf16charsBegin(utf16charsBegin),
_utf16charsEnd(utf16charsEnd)
{}
// Returns the begin position of this block as stored in _bytesBegin.
unsigned bytesBegin() const
{
    return _bytesBegin;
}
// Returns the end position of this block as stored in _bytesEnd.
unsigned bytesEnd() const
{
    return _bytesEnd;
}
inline unsigned begin() const inline unsigned utf16charsBegin() const
{ return _begin; } { return _utf16charsBegin; }
inline unsigned end() const inline unsigned utf16charsEnd() const
{ return _end; } { return _utf16charsEnd; }
bool contains(unsigned pos) const bool containsUtf16charOffset(unsigned utf16charOffset) const
{ return pos >= _begin && pos < _end; } { return utf16charOffset >= _utf16charsBegin && utf16charOffset < _utf16charsEnd; }
}; };
class Include { class Include {
...@@ -259,8 +273,11 @@ public: ...@@ -259,8 +273,11 @@ public:
unsigned _beginLine; unsigned _beginLine;
public: public:
inline MacroUse(const Macro &macro, unsigned begin, unsigned end, unsigned beginLine) inline MacroUse(const Macro &macro,
: Block(begin, end), unsigned bytesBegin, unsigned bytesEnd,
unsigned utf16charsBegin, unsigned utf16charsEnd,
unsigned beginLine)
: Block(bytesBegin, bytesEnd, utf16charsBegin, utf16charsEnd),
_macro(macro), _macro(macro),
_beginLine(beginLine) _beginLine(beginLine)
{ } { }
...@@ -293,8 +310,12 @@ public: ...@@ -293,8 +310,12 @@ public:
public: public:
inline UndefinedMacroUse( inline UndefinedMacroUse(
const QByteArray &name, const QByteArray &name,
unsigned begin) unsigned bytesBegin,
: Block(begin, begin + name.length()), unsigned utf16charsBegin)
: Block(bytesBegin,
bytesBegin + name.length(),
utf16charsBegin,
utf16charsBegin + QString::fromUtf8(name, name.size()).size()),
_name(name) _name(name)
{ } { }
...@@ -328,8 +349,8 @@ public: ...@@ -328,8 +349,8 @@ public:
{ return _includeGuardMacroName; } { return _includeGuardMacroName; }
const Macro *findMacroDefinitionAt(unsigned line) const; const Macro *findMacroDefinitionAt(unsigned line) const;
const MacroUse *findMacroUseAt(unsigned offset) const; const MacroUse *findMacroUseAt(unsigned utf16charsOffset) const;
const UndefinedMacroUse *findUndefinedMacroUseAt(unsigned offset) const; const UndefinedMacroUse *findUndefinedMacroUseAt(unsigned utf16charsOffset) const;
void keepSourceAndAST(); void keepSourceAndAST();
void releaseSourceAndAST(); void releaseSourceAndAST();
......
...@@ -108,37 +108,45 @@ static const Macro revision(const Snapshot &s, const Macro &m) ...@@ -108,37 +108,45 @@ static const Macro revision(const Snapshot &s, const Macro &m)
return m; return m;
} }
void FastPreprocessor::passedMacroDefinitionCheck(unsigned offset, unsigned line, const Macro &macro) void FastPreprocessor::passedMacroDefinitionCheck(unsigned bytesOffset, unsigned utf16charsOffset,
unsigned line, const Macro &macro)
{ {
Q_ASSERT(_currentDoc); Q_ASSERT(_currentDoc);
_currentDoc->addMacroUse(revision(_snapshot, macro), _currentDoc->addMacroUse(revision(_snapshot, macro),
offset, macro.name().length(), line, bytesOffset, macro.name().size(),
QVector<MacroArgumentReference>()); utf16charsOffset, macro.nameToQString().size(),
line, QVector<MacroArgumentReference>());
} }
void FastPreprocessor::failedMacroDefinitionCheck(unsigned offset, const ByteArrayRef &name) void FastPreprocessor::failedMacroDefinitionCheck(unsigned bytesOffset, unsigned utf16charsOffset,
const ByteArrayRef &name)
{ {
Q_ASSERT(_currentDoc); Q_ASSERT(_currentDoc);
_currentDoc->addUndefinedMacroUse(QByteArray(name.start(), name.size()), offset); _currentDoc->addUndefinedMacroUse(QByteArray(name.start(), name.size()),
bytesOffset, utf16charsOffset);
} }
void FastPreprocessor::notifyMacroReference(unsigned offset, unsigned line, const Macro &macro) void FastPreprocessor::notifyMacroReference(unsigned bytesOffset, unsigned utf16charsOffset,
unsigned line, const Macro &macro)
{ {
Q_ASSERT(_currentDoc); Q_ASSERT(_currentDoc);
_currentDoc->addMacroUse(revision(_snapshot, macro), _currentDoc->addMacroUse(revision(_snapshot, macro),
offset, macro.name().length(), line, bytesOffset, macro.name().size(),
QVector<MacroArgumentReference>()); utf16charsOffset, macro.nameToQString().size(),
line, QVector<MacroArgumentReference>());
} }
void FastPreprocessor::startExpandingMacro(unsigned offset, unsigned line, void FastPreprocessor::startExpandingMacro(unsigned bytesOffset, unsigned utf16charsOffset,
const Macro &macro, unsigned line, const Macro &macro,
const QVector<MacroArgumentReference> &actuals) const QVector<MacroArgumentReference> &actuals)
{ {
Q_ASSERT(_currentDoc); Q_ASSERT(_currentDoc);
_currentDoc->addMacroUse(revision(_snapshot, macro), _currentDoc->addMacroUse(revision(_snapshot, macro),
offset, macro.name().length(), line, actuals); bytesOffset, macro.name().size(),
utf16charsOffset, macro.nameToQString().size(),
line, actuals);
} }
...@@ -61,12 +61,13 @@ public: ...@@ -61,12 +61,13 @@ public:
virtual void macroAdded(const Macro &); virtual void macroAdded(const Macro &);
virtual void passedMacroDefinitionCheck(unsigned, unsigned, const Macro &); virtual void passedMacroDefinitionCheck(unsigned, unsigned, unsigned, const Macro &);
virtual void failedMacroDefinitionCheck(unsigned, const ByteArrayRef &); virtual void failedMacroDefinitionCheck(unsigned, unsigned, const ByteArrayRef &);
virtual void notifyMacroReference(unsigned, unsigned, const Macro &); virtual void notifyMacroReference(unsigned, unsigned, unsigned, const Macro &);
virtual void startExpandingMacro(unsigned, virtual void startExpandingMacro(unsigned,
unsigned,
unsigned, unsigned,
const Macro &, const Macro &,
const QVector<MacroArgumentReference> &); const QVector<MacroArgumentReference> &);
......
...@@ -55,7 +55,8 @@ Macro::Macro() ...@@ -55,7 +55,8 @@ Macro::Macro()
_hashcode(0), _hashcode(0),
_fileRevision(0), _fileRevision(0),
_line(0), _line(0),
_offset(0), _bytesOffset(0),
_utf16charsOffset(0),
_length(0), _length(0),
_state(0) _state(0)
{ } { }
......
...@@ -71,6 +71,9 @@ public: ...@@ -71,6 +71,9 @@ public:
QByteArray name() const QByteArray name() const
{ return _name; } { return _name; }
// Returns the macro name (_name) decoded from UTF-8 into a QString.
// The full byte array is decoded, using its size rather than a terminator.
QString nameToQString() const
{
    return QString::fromUtf8(_name.constData(), _name.size());
}
void setName(const QByteArray &name) void setName(const QByteArray &name)
{ _name = name; } { _name = name; }
...@@ -107,11 +110,17 @@ public: ...@@ -107,11 +110,17 @@ public:
void setLine(unsigned line) void setLine(unsigned line)
{ _line = line; } { _line = line; }
unsigned offset() const unsigned bytesOffset() const
{ return _offset; } { return _bytesOffset; }
// Stores the given offset in _bytesOffset.
void setBytesOffset(unsigned bytesOffset)
{
    _bytesOffset = bytesOffset;
}
// Returns the offset stored in _utf16charsOffset.
unsigned utf16CharOffset() const
{
    return _utf16charsOffset;
}
void setOffset(unsigned offset) void setUtf16charOffset(unsigned utf16charOffset)
{ _offset = offset; } { _utf16charsOffset = utf16charOffset; }
unsigned length() const unsigned length() const
{ return _length; } { return _length; }
...@@ -161,7 +170,8 @@ private: ...@@ -161,7 +170,8 @@ private:
unsigned _hashcode; unsigned _hashcode;
unsigned _fileRevision; unsigned _fileRevision;
unsigned _line; unsigned _line;
unsigned _offset; unsigned _bytesOffset;
unsigned _utf16charsOffset;
unsigned _length; unsigned _length;
union union
......
...@@ -58,5 +58,6 @@ void Internal::PPToken::squeezeSource() ...@@ -58,5 +58,6 @@ void Internal::PPToken::squeezeSource()
m_src = m_src.mid(byteOffset, f.bytes); m_src = m_src.mid(byteOffset, f.bytes);
m_src.squeeze(); m_src.squeeze();
byteOffset = 0; byteOffset = 0;
utf16charOffset = 0;
} }
} }
...@@ -46,19 +46,19 @@ class Macro; ...@@ -46,19 +46,19 @@ class Macro;
class CPLUSPLUS_EXPORT MacroArgumentReference class CPLUSPLUS_EXPORT MacroArgumentReference
{ {
unsigned _position; unsigned _utf16charsOffset;
unsigned _length; unsigned _utf16charsLength;
public: public:
explicit MacroArgumentReference(unsigned position = 0, unsigned length = 0) explicit MacroArgumentReference(unsigned utf16charsOffset = 0, unsigned utf16charsLength = 0)
: _position(position), _length(length) : _utf16charsOffset(utf16charsOffset), _utf16charsLength(utf16charsLength)
{ } { }
unsigned position() const unsigned utf16charsOffset() const
{ return _position; } { return _utf16charsOffset; }
unsigned length() const unsigned utf16charsLength() const
{ return _length; } { return _utf16charsLength; }
}; };