Commit 23c637c4 authored by Leandro Melo, committed by hjk

C++: Introduce unicode char/strings support

Those are the types char16_t and char32_t along with the new
char/string literals u'', U'', u"", u8"", and U"".

This is particularly important for the use of QStringLiteral,
since on some platforms it relies on expansions such as the above.

Note: The string literal quick fixes still need some tuning.

Task-number: QTCREATORBUG-7449
Change-Id: Iebcfea15677dc8e0ebb6143def89a5477e1be7d4
Reviewed-by: hjk <qthjk@ovi.com>
parent b88a5f5d
......@@ -2750,6 +2750,18 @@ bool Bind::visit(SimpleSpecifierAST *ast)
_type.setType(control()->integerType(IntegerType::Char));
break;
case T_CHAR16_T:
if (_type)
translationUnit()->error(ast->specifier_token, "duplicate data type in declaration");
_type.setType(control()->integerType(IntegerType::Char16));
break;
case T_CHAR32_T:
if (_type)
translationUnit()->error(ast->specifier_token, "duplicate data type in declaration");
_type.setType(control()->integerType(IntegerType::Char32));
break;
case T_WCHAR_T:
if (_type)
translationUnit()->error(ast->specifier_token, "duplicate data type in declaration");
......
......@@ -70,6 +70,8 @@ class CPLUSPLUS_EXPORT IntegerType: public Type
public:
enum Kind {
Char,
Char16,
Char32,
WideChar,
Bool,
Short,
......
......@@ -778,6 +778,28 @@ static inline int classify8(const char *s, bool q, bool x) {
}
}
}
} else if (x && s[1] == 'h') {
if (s[2] == 'a') {
if (s[3] == 'r') {
if (s[4] == '1') {
if (s[5] == '6') {
if (s[6] == '_') {
if (s[7] == 't') {
return T_CHAR16_T;
}
}
}
} else if (s[4] == '3') {
if (s[5] == '2') {
if (s[6] == '_') {
if (s[7] == 't') {
return T_CHAR32_T;
}
}
}
}
}
}
}
}
else if (x && s[0] == 'd') {
......
......@@ -224,37 +224,13 @@ void Lexer::scan_helper(Token *tok)
}
goto _Lagain;
case '"': case '\'': {
const char quote = ch;
tok->f.kind = quote == '"'
? T_STRING_LITERAL
: T_CHAR_LITERAL;
const char *yytext = _currentChar;
while (_yychar && _yychar != quote) {
if (_yychar == '\n')
break;
else if (_yychar != '\\')
yyinp();
else {
yyinp(); // skip `\\'
if (_yychar)
yyinp();
}
}
// assert(_yychar == quote);
int yylen = _currentChar - yytext;
if (_yychar == quote)
yyinp();
case '"':
scanStringLiteral(tok);
break;
if (control())
tok->string = control()->stringLiteral(yytext, yylen);
} break;
case '\'':
scanCharLiteral(tok);
break;
case '{':
tok->f.kind = T_LBRACE;
......@@ -589,112 +565,148 @@ void Lexer::scan_helper(Token *tok)
tok->f.kind = classifyObjCAtKeyword(yytext, yylen);
break;
} else if (ch == '@' && _yychar == '"') {
// objc @string literals
yyinp();
tok->f.kind = T_AT_STRING_LITERAL;
const char *yytext = _currentChar;
scanStringLiteral(tok, '"');
break;
}
}
while (_yychar && _yychar != '"') {
if (_yychar != '\\')
if (ch == 'L' || ch == 'u' || ch == 'U') {
// Either a literal or still an identifier.
if (_yychar == '"') {
yyinp();
scanStringLiteral(tok, ch);
} else if (_yychar == '\'') {
yyinp();
scanCharLiteral(tok, ch);
} else {
if (_yychar == '8') {
unsigned char la = 0;
if (_currentChar + 1 != _lastChar)
la = *(_currentChar + 1);
if (la == '"') {
yyinp();
else {
yyinp(); // skip `\\'
if (_yychar)
yyinp();
yyinp();
scanStringLiteral(tok, '8');
} else if (la == '\'') {
yyinp();
yyinp();
scanCharLiteral(tok, '8');
} else {
scanIdentifier(tok);
}
} else {
scanIdentifier(tok);
}
// assert(_yychar == '"');
int yylen = _currentChar - yytext;
if (_yychar == '"')
yyinp();
if (control())
tok->string = control()->stringLiteral(yytext, yylen);
break;
}
} else if (std::isalpha(ch) || ch == '_' || ch == '$') {
scanIdentifier(tok);
} else if (std::isdigit(ch)) {
scanNumericLiteral(tok);
} else {
tok->f.kind = T_ERROR;
}
break;
} // default
if (ch == 'L' && (_yychar == '"' || _yychar == '\'')) {
// wide char/string literals
ch = _yychar;
yyinp();
const char quote = ch;
tok->f.kind = quote == '"'
? T_WIDE_STRING_LITERAL
: T_WIDE_CHAR_LITERAL;
} // switch
}
const char *yytext = _currentChar;
// Scans a double-quoted literal into `tok` and then tags the token with the
// string-literal kind selected by `hint`:
//   'L' -> wide, 'U' -> UTF-32, 'u' -> UTF-16, '8' -> UTF-8, '@' -> @string.
// Any other hint value yields a plain T_STRING_LITERAL.
// NOTE(review): the Objective-C @"..." call site appears to pass '"' rather
// than '@', which would select the plain-string fallback below -- confirm.
void Lexer::scanStringLiteral(Token *tok, unsigned char hint)
{
    // The literal text is consumed first; the kind is assigned afterwards.
    scanUntilQuote(tok, '"');

    switch (hint) {
    case 'L':
        tok->f.kind = T_WIDE_STRING_LITERAL;
        break;
    case 'U':
        tok->f.kind = T_UTF32_STRING_LITERAL;
        break;
    case 'u':
        tok->f.kind = T_UTF16_STRING_LITERAL;
        break;
    case '8':
        tok->f.kind = T_UTF8_STRING_LITERAL;
        break;
    case '@':
        tok->f.kind = T_AT_STRING_LITERAL;
        break;
    default:
        tok->f.kind = T_STRING_LITERAL;
        break;
    }
}
while (_yychar && _yychar != quote) {
if (_yychar != '\\')
yyinp();
else {
yyinp(); // skip `\\'
// Scans a single-quoted literal into `tok` and then tags the token with the
// char-literal kind selected by `hint`:
//   'L' -> wide, 'U' -> UTF-32, 'u' -> UTF-16.
// Every other hint value (including '8') yields a plain T_CHAR_LITERAL.
void Lexer::scanCharLiteral(Token *tok, unsigned char hint)
{
    // The literal text is consumed first; the kind is assigned afterwards.
    scanUntilQuote(tok, '\'');

    switch (hint) {
    case 'L':
        tok->f.kind = T_WIDE_CHAR_LITERAL;
        break;
    case 'U':
        tok->f.kind = T_UTF32_CHAR_LITERAL;
        break;
    case 'u':
        tok->f.kind = T_UTF16_CHAR_LITERAL;
        break;
    default:
        tok->f.kind = T_CHAR_LITERAL;
        break;
    }
}
if (_yychar)
yyinp();
}
}
// assert(_yychar == quote);
void Lexer::scanUntilQuote(Token *tok, unsigned char quote)
{
assert(quote == '"' || quote == '\'');
int yylen = _currentChar - yytext;
const char *yytext = _currentChar;
while (_yychar && _yychar != quote) {
if (_yychar != '\\')
yyinp();
else {
yyinp(); // skip `\\'
if (_yychar == quote)
if (_yychar)
yyinp();
}
}
int yylen = _currentChar - yytext;
if (control())
tok->string = control()->stringLiteral(yytext, yylen);
} else if (std::isalpha(ch) || ch == '_' || ch == '$') {
const char *yytext = _currentChar - 1;
while (std::isalnum(_yychar) || _yychar == '_' || _yychar == '$')
yyinp();
int yylen = _currentChar - yytext;
if (f._scanKeywords)
tok->f.kind = classify(yytext, yylen, f._qtMocRunEnabled, f._cxx0xEnabled);
else
tok->f.kind = T_IDENTIFIER;
if (_yychar == quote)
yyinp();
if (tok->f.kind == T_IDENTIFIER) {
tok->f.kind = classifyOperator(yytext, yylen);
if (control())
tok->string = control()->stringLiteral(yytext, yylen);
}
if (control())
tok->identifier = control()->identifier(yytext, yylen);
}
break;
} else if (std::isdigit(ch)) {
const char *yytext = _currentChar - 1;
while (_yychar) {
if (_yychar == 'e' || _yychar == 'E') {
yyinp();
if (_yychar == '-' || _yychar == '+') {
yyinp();
// ### assert(std::isdigit(_yychar));
}
} else if (std::isalnum(_yychar) || _yychar == '.') {
yyinp();
} else {
break;
}
void Lexer::scanNumericLiteral(Token *tok)
{
const char *yytext = _currentChar - 1;
while (_yychar) {
if (_yychar == 'e' || _yychar == 'E') {
yyinp();
if (_yychar == '-' || _yychar == '+') {
yyinp();
// ### assert(std::isdigit(_yychar));
}
int yylen = _currentChar - yytext;
tok->f.kind = T_NUMERIC_LITERAL;
if (control())
tok->number = control()->numericLiteral(yytext, yylen);
break;
} else if (std::isalnum(_yychar) || _yychar == '.') {
yyinp();
} else {
tok->f.kind = T_ERROR;
break;
}
} // default
}
int yylen = _currentChar - yytext;
} // switch
tok->f.kind = T_NUMERIC_LITERAL;
if (control())
tok->number = control()->numericLiteral(yytext, yylen);
}
// Scans an identifier-like word and classifies it.
// The word starts one character before _currentChar and extends over every
// following alphanumeric, '_' or '$' character. With keyword scanning enabled
// the word is run through classify(); otherwise it is T_IDENTIFIER. Words that
// remain T_IDENTIFIER are re-classified by classifyOperator() and, when a
// control object is present, interned as the token's identifier.
void Lexer::scanIdentifier(Token *tok)
{
    const char *wordStart = _currentChar - 1;

    for (;;) {
        const bool belongsToWord =
                std::isalnum(_yychar) || _yychar == '_' || _yychar == '$';
        if (!belongsToWord)
            break;
        yyinp();
    }

    int wordLength = _currentChar - wordStart;

    tok->f.kind = f._scanKeywords
            ? classify(wordStart, wordLength, f._qtMocRunEnabled, f._cxx0xEnabled)
            : T_IDENTIFIER;

    // Keywords are finished here; only plain identifiers need further work.
    if (tok->f.kind != T_IDENTIFIER)
        return;

    tok->f.kind = classifyOperator(wordStart, wordLength);

    if (control())
        tok->identifier = control()->identifier(wordStart, wordLength);
}
......@@ -90,6 +90,12 @@ private:
static int classifyObjCAtKeyword(const char *s, int n);
static int classifyOperator(const char *string, int length);
void scanStringLiteral(Token *tok, unsigned char hint = 0);
void scanCharLiteral(Token *tok, unsigned char hint = 0);
void scanUntilQuote(Token *tok, unsigned char quote);
void scanNumericLiteral(Token *tok);
void scanIdentifier(Token *tok);
inline void yyinp()
{
if (++_currentChar == _lastChar)
......
......@@ -313,6 +313,8 @@ bool Parser::skipUntilStatement()
case T_CATCH:
case T_THROW:
case T_CHAR:
case T_CHAR16_T:
case T_CHAR32_T:
case T_WCHAR_T:
case T_BOOL:
case T_SHORT:
......@@ -2811,12 +2813,21 @@ bool Parser::parseUnqualifiedName(NameAST *&node, bool acceptTemplateId)
bool Parser::parseStringLiteral(ExpressionAST *&node)
{
DEBUG_THIS_RULE();
if (! (LA() == T_STRING_LITERAL || LA() == T_WIDE_STRING_LITERAL))
if (! (LA() == T_STRING_LITERAL
|| LA() == T_WIDE_STRING_LITERAL
|| LA() == T_UTF8_STRING_LITERAL
|| LA() == T_UTF16_STRING_LITERAL
|| LA() == T_UTF32_STRING_LITERAL)) {
return false;
}
StringLiteralAST **ast = reinterpret_cast<StringLiteralAST **> (&node);
while (LA() == T_STRING_LITERAL || LA() == T_WIDE_STRING_LITERAL) {
while (LA() == T_STRING_LITERAL
|| LA() == T_WIDE_STRING_LITERAL
|| LA() == T_UTF8_STRING_LITERAL
|| LA() == T_UTF16_STRING_LITERAL
|| LA() == T_UTF32_STRING_LITERAL) {
*ast = new (_pool) StringLiteralAST;
(*ast)->literal_token = consumeToken();
ast = &(*ast)->next;
......@@ -3541,6 +3552,8 @@ bool Parser::lookAtBuiltinTypeSpecifier() const
{
switch (LA()) {
case T_CHAR:
case T_CHAR16_T:
case T_CHAR32_T:
case T_WCHAR_T:
case T_BOOL:
case T_SHORT:
......@@ -3982,7 +3995,9 @@ bool Parser::parseNumericLiteral(ExpressionAST *&node)
DEBUG_THIS_RULE();
if (LA() == T_NUMERIC_LITERAL ||
LA() == T_CHAR_LITERAL ||
LA() == T_WIDE_CHAR_LITERAL) {
LA() == T_WIDE_CHAR_LITERAL ||
LA() == T_UTF16_CHAR_LITERAL ||
LA() == T_UTF32_CHAR_LITERAL) {
NumericLiteralAST *ast = new (_pool) NumericLiteralAST;
ast->literal_token = consumeToken();
node = ast;
......@@ -4021,6 +4036,9 @@ bool Parser::parsePrimaryExpression(ExpressionAST *&node)
switch (LA()) {
case T_STRING_LITERAL:
case T_WIDE_STRING_LITERAL:
case T_UTF8_STRING_LITERAL:
case T_UTF16_STRING_LITERAL:
case T_UTF32_STRING_LITERAL:
return parseStringLiteral(node);
case T_NULLPTR:
......@@ -4030,6 +4048,8 @@ bool Parser::parsePrimaryExpression(ExpressionAST *&node)
case T_CHAR_LITERAL: // ### FIXME don't use NumericLiteral for chars
case T_WIDE_CHAR_LITERAL:
case T_UTF16_CHAR_LITERAL:
case T_UTF32_CHAR_LITERAL:
case T_NUMERIC_LITERAL:
return parseNumericLiteral(node);
......
......@@ -29,8 +29,12 @@ static const char *token_names[] = {
("<C++ comment>"), ("<C++ doxy comment>"),
("<comment>"), ("<doxy comment>"),
("<identifier>"), ("<numeric literal>"), ("<char literal>"),
("<wide char literal>"), ("<string literal>"), ("<wide char literal>"),
("<identifier>"),
("<numeric literal>"),
("<char literal>"), ("<wide char literal>"), ("<utf16 char literal>"), ("<utf32 char literal>"),
("<string literal>"), ("<wide string literal>"), ("<utf8 string literal>"),
("<utf16 string literal>"), ("<utf32 string literal>"),
("<@string literal>"), ("<angle string literal>"),
("&"), ("&&"), ("&="), ("->"), ("->*"), ("^"), ("^="), (":"), ("::"),
......@@ -40,7 +44,8 @@ static const char *token_names[] = {
("|="), ("||"), ("+"), ("+="), ("++"), ("#"), ("##"), ("?"), ("}"),
("]"), (")"), (";"), ("*"), ("*="), ("~"), ("~="),
("asm"), ("auto"), ("bool"), ("break"), ("case"), ("catch"), ("char"),
("asm"), ("auto"), ("bool"), ("break"), ("case"), ("catch"),
("char"), ("char16_t"), ("char32_t"),
("class"), ("const"), ("const_cast"), ("constexpr"), ("continue"),
("decltype"), ("default"),
("delete"), ("do"), ("double"), ("dynamic_cast"), ("else"), ("enum"),
......@@ -92,11 +97,16 @@ const char *Token::spell() const
case T_NUMERIC_LITERAL:
case T_CHAR_LITERAL:
case T_WIDE_CHAR_LITERAL:
case T_UTF16_CHAR_LITERAL:
case T_UTF32_CHAR_LITERAL:
case T_STRING_LITERAL:
case T_WIDE_STRING_LITERAL:
case T_UTF8_STRING_LITERAL:
case T_UTF16_STRING_LITERAL:
case T_UTF32_STRING_LITERAL:
case T_AT_STRING_LITERAL:
case T_ANGLE_STRING_LITERAL:
case T_WIDE_CHAR_LITERAL:
case T_WIDE_STRING_LITERAL:
return literal->chars();
default:
......
......@@ -40,10 +40,15 @@ enum Kind {
T_FIRST_CHAR_LITERAL,
T_CHAR_LITERAL = T_FIRST_CHAR_LITERAL,
T_WIDE_CHAR_LITERAL,
T_LAST_CHAR_LITERAL = T_WIDE_CHAR_LITERAL,
T_UTF16_CHAR_LITERAL,
T_UTF32_CHAR_LITERAL,
T_LAST_CHAR_LITERAL = T_UTF32_CHAR_LITERAL,
T_FIRST_STRING_LITERAL,
T_STRING_LITERAL = T_FIRST_STRING_LITERAL,
T_WIDE_STRING_LITERAL,
T_UTF8_STRING_LITERAL,
T_UTF16_STRING_LITERAL,
T_UTF32_STRING_LITERAL,
T_AT_STRING_LITERAL,
T_ANGLE_STRING_LITERAL,
T_LAST_STRING_LITERAL = T_ANGLE_STRING_LITERAL,
......@@ -112,6 +117,8 @@ enum Kind {
T_CASE,
T_CATCH,
T_CHAR,
T_CHAR16_T,
T_CHAR32_T,
T_CLASS,
T_CONST,
T_CONST_CAST,
......
......@@ -205,8 +205,8 @@ void TranslationUnit::tokenize()
unsigned line = (unsigned) strtoul(tk.spell(), 0, 0);
lex(&tk);
if (! tk.f.newline && tk.is(T_STRING_LITERAL)) {
const StringLiteral *fileName = control()->stringLiteral(tk.string->chars(),
tk.string->size());
const StringLiteral *fileName =
control()->stringLiteral(tk.string->chars(), tk.string->size());
pushPreprocessorLine(offset, line, fileName);
lex(&tk);
}
......
......@@ -170,7 +170,7 @@ QString MatchingText::insertMatchingBrace(const QTextCursor &cursor, const QStri
const Token &token = tk[index - 1];
if (text.at(0) == QLatin1Char('"') && (token.is(T_STRING_LITERAL) || token.is(T_WIDE_STRING_LITERAL))) {
if (text.at(0) == QLatin1Char('"') && token.isStringLiteral()) {
if (text.length() != 1)
qWarning() << Q_FUNC_INFO << "handle event compression";
......@@ -178,7 +178,7 @@ QString MatchingText::insertMatchingBrace(const QTextCursor &cursor, const QStri
return QLatin1String("\"");
return QString();
} else if (text.at(0) == QLatin1Char('\'') && (token.is(T_CHAR_LITERAL) || token.is(T_WIDE_CHAR_LITERAL))) {
} else if (text.at(0) == QLatin1Char('\'') && token.isCharLiteral()) {
if (text.length() != 1)
qWarning() << Q_FUNC_INFO << "handle event compression";
......
......@@ -289,13 +289,16 @@ bool ResolveExpression::visit(NumericLiteralAST *ast)
Type *type = 0;
bool isUnsigned = false;
if (tk.is(T_CHAR_LITERAL))
if (tk.is(T_CHAR_LITERAL)) {
type = control()->integerType(IntegerType::Char);
else if (tk.is(T_WIDE_CHAR_LITERAL))
} else if (tk.is(T_WIDE_CHAR_LITERAL)) {
type = control()->integerType(IntegerType::WideChar);
else if (const NumericLiteral *literal = numericLiteral(ast->literal_token)) {
} else if (tk.is(T_UTF16_CHAR_LITERAL)) {
type = control()->integerType(IntegerType::Char16);
} else if (tk.is(T_UTF32_CHAR_LITERAL)) {
type = control()->integerType(IntegerType::Char32);
} else if (const NumericLiteral *literal = numericLiteral(ast->literal_token)) {
isUnsigned = literal->isUnsigned();
if (literal->isInt())
type = control()->integerType(IntegerType::Int);
else if (literal->isLong())
......
......@@ -185,6 +185,12 @@ void TypePrettyPrinter::visit(IntegerType *type)
case IntegerType::Char:
_text.prepend(QLatin1String("char"));
break;
case IntegerType::Char16:
_text.prepend(QLatin1String("char16_t"));
break;
case IntegerType::Char32:
_text.prepend(QLatin1String("char32_t"));
break;
case IntegerType::WideChar:
_text.prepend(QLatin1String("wchar_t"));
break;
......
......@@ -76,8 +76,7 @@ bool CppAutoCompleter::contextAllowsElectricCharacters(const QTextCursor &cursor
if (isInCommentHelper(cursor, &token))
return false;
if (token.is(T_STRING_LITERAL) || token.is(T_WIDE_STRING_LITERAL)
|| token.is(T_CHAR_LITERAL) || token.is(T_WIDE_CHAR_LITERAL)) {
if (token.isStringLiteral() || token.isCharLiteral()) {
const unsigned pos = cursor.selectionEnd() - cursor.block().position();
if (pos <= token.end())
return false;
......
......@@ -456,6 +456,8 @@ static bool canReplaceSpecifier(TranslationUnit *translationUnit, SpecifierAST *
case T_CONST:
case T_VOLATILE:
case T_CHAR:
case T_CHAR16_T:
case T_CHAR32_T:
case T_WCHAR_T:
case T_BOOL:
case T_SHORT:
......
......@@ -167,15 +167,10 @@ void CppHighlighter::highlightBlock(const QString &text)
} else if (tk.is(T_NUMERIC_LITERAL))
setFormat(tk.begin(), tk.length(), m_formats[CppNumberFormat]);
else if (tk.is(T_STRING_LITERAL) || tk.is(T_CHAR_LITERAL) || tk.is(T_ANGLE_STRING_LITERAL) ||
tk.is(T_AT_STRING_LITERAL))
setFormat(tk.begin(), tk.length(), m_formats[CppStringFormat]);
else if (tk.is(T_WIDE_STRING_LITERAL) || tk.is(T_WIDE_CHAR_LITERAL))
else if (tk.isStringLiteral() || tk.isCharLiteral())
setFormat(tk.begin(), tk.length(), m_formats[CppStringFormat]);
else if (tk.isComment()) {
if (tk.is(T_COMMENT) || tk.is(T_CPP_COMMENT))
setFormat(tk.begin(), tk.length(), m_formats[CppCommentFormat]);
......
......@@ -851,6 +851,8 @@ bool CodeFormatter::tryDeclaration()
}
// fallthrough
case T_CHAR:
case T_CHAR16_T:
case T_CHAR32_T:
case T_WCHAR_T:
case T_BOOL:
case T_SHORT:
......
......@@ -89,9 +89,7 @@ bool GLSLCompleter::contextAllowsElectricCharacters(const QTextCursor &cursor) c
if (pos < tk.end())
return false;
}
else if (tk.is(T_STRING_LITERAL) || tk.is(T_WIDE_STRING_LITERAL)
|| tk.is(T_CHAR_LITERAL) || tk.is(T_WIDE_CHAR_LITERAL)) {
else if (tk.isStringLiteral() || tk.isCharLiteral()) {
const unsigned pos = cursor.selectionEnd() - cursor.block().position();
if (pos <= tk.end())
return false;
......