Files
triton/lib/lang/scanner.cc

453 lines
10 KiB
C++

#include "triton/lang/scanner.h"
#include <cctype>
#include <climits>
void Scanner::Tokenize(TokenSequence& ts) {
while (true) {
auto tok = Scan();
if (tok->tag_ == Token::END) {
if (ts.Empty() || (ts.Back()->tag_ != Token::NEW_LINE)) {
auto t = Token::New(*tok);
t->tag_ = Token::NEW_LINE;
t->str_ = "\n";
ts.InsertBack(t);
}
break;
} else {
if (!ts.Empty() && ts.Back()->tag_ == Token::NEW_LINE)
tok->ws_ = true;
ts.InsertBack(tok);
}
}
}
std::string Scanner::ScanHeadName(const Token* lhs, const Token* rhs) {
std::string str;
const char* begin = lhs->loc_.Begin() + 1;
const char* end = rhs->loc_.Begin();
for (; begin != end; ++begin) {
if (*begin == '\n' && str.back() == '\\')
str.pop_back();
else
str.push_back(*begin);
}
return str;
}
Token* Scanner::Scan(bool ws) {
tok_.ws_ = ws;
SkipWhiteSpace();
Mark();
if (Test('\n')) {
auto ret = MakeNewLine();
Next();
return ret;
}
auto c = Next();
switch (c) {
case '#': return MakeToken(Try('#') ? Token::DSHARP: c);
case ':': return MakeToken(Try('>') ? ']': c);
case '(': case ')': case '[': case ']':
case '?': case ',': case '{': case '}':
case '~': case ';': case '@':
return MakeToken(c);
case '-':
if (Try('>')) return MakeToken(Token::PTR);
if (Try('-')) return MakeToken(Token::DEC);
if (Try('=')) return MakeToken(Token::SUB_ASSIGN);
return MakeToken(c);
case '+':
if (Try('+')) return MakeToken(Token::INC);
if (Try('=')) return MakeToken(Token::ADD_ASSIGN);
return MakeToken(c);
case '<':
if (Try('<')) return MakeToken(Try('=') ? Token::LEFT_ASSIGN: Token::LEFT);
if (Try('=')) return MakeToken(Token::LE);
if (Try(':')) return MakeToken('[');
if (Try('%')) return MakeToken('{');
return MakeToken(c);
case '%':
if (Try('=')) return MakeToken(Token::MOD_ASSIGN);
if (Try('>')) return MakeToken('}');
if (Try(':')) {
if (Try('%')) {
if (Try(':')) return MakeToken(Token::DSHARP);
PutBack();
}
return MakeToken('#');
}
return MakeToken(c);
case '>':
if (Try('>')) return MakeToken(Try('=') ? Token::RIGHT_ASSIGN: Token::RIGHT);
if (Try('=')) return MakeToken(Token::GE);
return MakeToken(c);
case '=': return MakeToken(Try('=') ? Token::EQ: c);
case '!': return MakeToken(Try('=') ? Token::NE: c);
case '&':
if (Try('&')) return MakeToken(Token::LOGICAL_AND);
if (Try('=')) return MakeToken(Token::AND_ASSIGN);
return MakeToken(c);
case '|':
if (Try('|')) return MakeToken(Token::LOGICAL_OR);
if (Try('=')) return MakeToken(Token::OR_ASSIGN);
return MakeToken(c);
case '*': return MakeToken(Try('=') ? Token::MUL_ASSIGN: c);
case '/':
if (Test('/') || Test('*')) {
SkipComment();
return Scan(true);
}
return MakeToken(Try('=') ? Token::DIV_ASSIGN: c);
case '^': return MakeToken(Try('=') ? Token::XOR_ASSIGN: c);
case '.':
if (isdigit(Peek())) return SkipNumber();
if (Try('.')) {
if (Try('.')) return MakeToken(Token::ELLIPSIS);
PutBack();
return MakeToken('.');
}
return MakeToken(c);
case '0' ... '9': return SkipNumber();
case 'u': case 'U': case 'L': {
/*auto enc = */ScanEncoding(c);
if (Try('\'')) return SkipCharacter();
if (Try('\"')) return SkipLiteral();
return SkipIdentifier();
}
case '\'': return SkipCharacter();
case '\"': return SkipLiteral();
case 'a' ... 't': case 'v' ... 'z': case 'A' ... 'K':
case 'M' ... 'T': case 'V' ... 'Z': case '_': case '$':
case 0x80 ... 0xfd:
return SkipIdentifier();
case '\\':
// Universal character name is allowed in identifier
if (Test('u') || Test('U'))
return SkipIdentifier();
return MakeToken(Token::INVALID);
case '\0': return MakeToken(Token::END);
default: return MakeToken(Token::INVALID);
}
}
void Scanner::SkipWhiteSpace() {
while (isspace(Peek()) && Peek() != '\n') {
tok_.ws_ = true;
Next();
}
}
void Scanner::SkipComment() {
if (Try('/')) {
// Line comment terminated an newline or eof
while (!Empty()) {
if (Peek() == '\n')
return;
Next();
}
return;
} else if (Try('*')) {
while (!Empty()) {
auto c = Next();
if (c == '*' && Peek() == '/') {
Next();
return;
}
}
Error(loc_, "unterminated block comment");
}
assert(false);
}
std::string Scanner::ScanIdentifier() {
std::string val;
while (!Empty()) {
auto c = Next();
if (IsUCN(c)) {
c = ScanEscaped(); // Call ScanUCN()
AppendUCN(val, c);
} else {
val.push_back(c);
}
}
return val;
}
Token* Scanner::SkipIdentifier() {
PutBack();
auto c = Next();
while (isalnum(c)
|| (0x80 <= c && c <= 0xfd)
|| c == '_'
|| c == '$'
|| IsUCN(c)) {
if (IsUCN(c))
c = ScanEscaped(); // Just read it
c = Next();
}
PutBack();
return MakeToken(Token::IDENTIFIER);
}
// Scan PP-Number
Token* Scanner::SkipNumber() {
PutBack();
bool sawHexPrefix = false;
int tag = Token::I_CONSTANT;
auto c = Next();
while (c == '.' || isdigit(c) || isalpha(c) || c == '_' || IsUCN(c)) {
if (c == 'e' || c =='E' || c == 'p' || c == 'P') {
if (!Try('-')) Try('+');
if (!((c == 'e' || c == 'E') && sawHexPrefix))
tag = Token::F_CONSTANT;
} else if (IsUCN(c)) {
ScanEscaped();
} else if (c == '.') {
tag = Token::F_CONSTANT;
} else if (c == 'x' || c == 'X') {
sawHexPrefix = true;
}
c = Next();
}
PutBack();
return MakeToken(tag);
}
Encoding Scanner::ScanLiteral(std::string& val) {
auto enc = Test('\"') ? Encoding::NONE: ScanEncoding(Next());
Next();
val.resize(0);
while (!Test('\"')) {
auto c = Next();
bool isucn = IsUCN(c);
if (c == '\\')
c = ScanEscaped();
if (isucn)
AppendUCN(val, c);
else
val.push_back(c);
}
return enc;
}
Token* Scanner::SkipLiteral() {
auto c = Next();
while (c != '\"' && c != '\n' && c != '\0') {
if (c == '\\') Next();
c = Next();
}
if (c != '\"')
Error(loc_, "unterminated string literal");
return MakeToken(Token::LITERAL);
}
Encoding Scanner::ScanCharacter(int& val) {
auto enc = Test('\'') ? Encoding::NONE: ScanEncoding(Next());
Next();
val = 0;
while (!Test('\'')) {
auto c = Next();
if (c == '\\')
c = ScanEscaped();
if (enc == Encoding::NONE)
val = (val << 8) + c;
else
val = c;
}
return enc;
}
Token* Scanner::SkipCharacter() {
auto c = Next();
while (c != '\'' && c != '\n' && c != '\0') {
if (c == '\\') Next();
c = Next();
}
if (c != '\'')
Error(loc_, "unterminated character constant");
return MakeToken(Token::C_CONSTANT);
}
int Scanner::ScanEscaped() {
auto c = Next();
switch (c) {
case '\\': case '\'': case '\"': case '\?':
return c;
case 'a': return '\a';
case 'b': return '\b';
case 'f': return '\f';
case 'n': return '\n';
case 'r': return '\r';
case 't': return '\t';
case 'v': return '\v';
// Non-standard GCC extention
case 'e': return '\033';
case 'x': return ScanHexEscaped();
case '0' ... '7': return ScanOctEscaped(c);
case 'u': return ScanUCN(4);
case 'U': return ScanUCN(8);
default: Error(loc_, "unrecognized escape character '%c'", c);
}
return c; // Make compiler happy
}
int Scanner::ScanHexEscaped() {
int val = 0, c = Peek();
if (!isxdigit(c))
Error(loc_, "expect xdigit, but got '%c'", c);
while (isxdigit(c)) {
val = (val << 4) + XDigit(c);
Next();
c = Peek();
}
return val;
}
int Scanner::ScanOctEscaped(int c) {
int val = XDigit(c);
c = Peek();
if (!IsOctal(c))
return val;
val = (val << 3) + XDigit(c);
Next();
c = Peek();
if (!IsOctal(c))
return val;
val = (val << 3) + XDigit(c);
Next();
return val;
}
int Scanner::ScanUCN(int len) {
assert(len == 4 || len == 8);
int val = 0;
for (auto i = 0; i < len; ++i) {
auto c = Next();
if (!isxdigit(c))
Error(loc_, "expect xdigit, but got '%c'", c);
val = (val << 4) + XDigit(c);
}
return val;
}
int Scanner::XDigit(int c) {
switch (c) {
case '0' ... '9': return c - '0';
case 'a' ... 'z': return c - 'a' + 10;
case 'A' ... 'Z': return c - 'A' + 10;
default: assert(false); return c;
}
}
Encoding Scanner::ScanEncoding(int c) {
switch (c) {
case 'u': return Try('8') ? Encoding::UTF8: Encoding::CHAR16;
case 'U': return Encoding::CHAR32;
case 'L': return Encoding::WCHAR;
default: assert(false); return Encoding::NONE;
}
}
std::string* ReadFile(const std::string& filename) {
FILE* f = fopen(filename.c_str(), "r");
if (!f) Error("%s: No such file or directory", filename.c_str());
auto text = new std::string;
int c;
while (EOF != (c = fgetc(f)))
text->push_back(c);
fclose(f);
return text;
}
int Scanner::Next() {
int c = Peek();
++p_;
if (c == '\n') {
++loc_.line_;
loc_.column_ = 1;
loc_.lineBegin_ = p_;
} else {
++loc_.column_;
}
return c;
}
int Scanner::Peek() {
int c = (uint8_t)(*p_);
if (c == '\\' && p_[1] == '\n') {
p_ += 2;
++loc_.line_;
loc_.column_ = 1;
loc_.lineBegin_ = p_;
return Peek();
}
return c;
}
// There couldn't be more than one PutBack() that
// cross two line, so just leave lineBegin, because
// we never care about the pos of newline token
void Scanner::PutBack() {
int c = *--p_;
if (c == '\n' && p_[-1] == '\\') {
--loc_.line_;
--p_;
return PutBack();
} else if (c == '\n') {
--loc_.line_;
} else {
--loc_.column_;
}
}
Token* Scanner::MakeToken(int tag) {
tok_.tag_ = tag;
auto& str = tok_.str_;
str.resize(0);
const char* p = tok_.loc_.lineBegin_ + tok_.loc_.column_ - 1;
for (; p < p_; ++p) {
if (p[0] == '\n' && p[-1] == '\\')
str.pop_back();
else
str.push_back(p[0]);
}
return Token::New(tok_);
}
/*
* New line is special, it is generated before reading the character '\n'
*/
Token* Scanner::MakeNewLine() {
tok_.tag_ = '\n';
tok_.str_ = std::string(p_, p_ + 1);
return Token::New(tok_);
}