Commit 128c8168 authored by bbguimaraes's avatar bbguimaraes
Browse files

parser

parent 3529c7b2
......@@ -7,6 +7,7 @@ for details/demos/screenshots.
and filtering.
- mem_manager: best-fit memory allocator simulation.
- nqueens: n-queens solver and graphical display.
- parser: a top-down parser for a made-up programming language.
- sockets: an implementation of the game Jungle ("Dou Shou Qi", in Chinese)
with graphics and network communication.
- sudoku: Sudoku solver library/GUI using genetic algorithms.
......
A top-down parser for a made-up programming language. See `grammar.txt`.
# Compilation
No external dependencies, use qmake for compilation:
$ qmake
$ make
# Execution
The program reads the input from a file if a file name is passed as an argument,
or from standard input otherwise. The input is parsed and any errors are
reported on the standard error stream. There is also a "lexer debug mode",
enabled with the `--lexer` flag, that outputs each token as understood by the lexer:
$ ./parser tests/code.txt
$ echo $?
0
$ tac tests/code.txt | ./parser
1: expected TK_ALGORITMO, found fimalgoritmo(TK_FIM_ALG)
$ ./parser --lexer tests/code.txt | head -5
Token[0x7ffdeafcee50]: text(algoritmo), type(TK_ALGORITMO)
Token[0x7ffdeafcee80]: text("billy_bill"), type(TK_STRING)
Token[0x7ffdeafcee80]: text(funcao), type(TK_FUNCAO)
Token[0x7ffdeafcee80]: text(a), type(TK_IDENT)
Token[0x7ffdeafcee80]: text((), type(TK_ABRE_PAR)
<algoritmo> ::= "algoritmo" <nome> <fdec> <var> "inicio" <com> "fimalgoritmo"
<nome> ::= <string> | ""
<fdec> ::= "funcao" <func> <fdec> | "procedimento" <proc> <fdec> | ""
<func> ::= <ident> "(" <paramd> ")" ":" <tipo> <var> "inicio" <com> "fimfuncao"
<proc> ::= <ident> "(" <paramd> ")" <var> "inicio" <com> "fimprocedimento"
<paramd> ::= <dec> <paramdl> | ""
<paramdl> ::= ";" <dec> <paramdl> | ""
<var> ::= "var" <ldec> | ""
<ldec> ::= <dec> <ldec>| ""
<dec> ::= <ident> <lid> ":" <tipo> | ""
<lid> ::= "," <ident> <lid> | ""
<tipo> ::= <tiposimp> | <tipov>
<tiposimp> ::= "inteiro" | "real" | "logico" | "literal"
<tipov> ::= "vetor" <vdec>
<vdec> ::= "[" <cint> ".." <cint> <matriz> "]" "de" <tiposimp>
<matriz> ::= "," <cint> ".." <cint> | ""
<com> ::= <id> <atribfunc> <com> |
"se" <se> <com> |
"para" <para> <com> |
"repita" <repita> <com> |
"enquanto" <enquanto> <com> |
"leia" <leia> <com> |
"escreva" <escreva> <com> |
"escreval" <escreva> <com> |
"retorne" <exp>
<atribfunc> ::= "<-" <exp> | "(" <ldec> ")"
<se> ::= <exp> "entao" <com> <senao> "fimse"
<senao> ::= <com> "fimse" | ""
<para> ::= <ident> "de" <exp> "ate" <exp> "faca" <com> "fimpara"
<repita> ::= <com> "ate" <exp>
<enquanto> ::= <exp> "faca" <com> "fimenquanto"
<leia> ::= "(" <ident> ")"
<escreva> ::= "(" <exp> ")"
<exp> ::= <expou>
<expou> ::= <expe> <ou>
<ou> ::= "ou" <expou> | ""
<expe> ::= <exprel> <e>
<e> ::= "e" <expe> | ""
<exprel> ::= <expmais> <rel>
<rel> ::= "=" <exprel> | "<>" <exprel> | ">" <exprel> | "<" <exprel> | ">=" <exprel> | "<=" <exprel> | ""
<expmais> ::= <expmul> <mais>
<mais> ::= "+" <expmais> | "-" <expmais> | ""
<expmul> ::= <exppot> <mul>
<mul> ::= "*" <expmul> | "/" <expmul> | "mod" <expmul> | "div" <expmul> | ""
<exppot> ::= <t> <pot>
<pot> ::= "^" <exppot> | ""
<t> ::= <id> | "-" <t> | <cint> | <creal> | "verdadeiro" | "falso" | <string> | "(" <exp> ")"
<id> ::= <ident> <subs>
<subs> ::= "[" <exp> <subsmat> "]" | ""
<subsmat> ::= "," <exp> | ""
#ifndef LEXER_H
#define LEXER_H
#include <string>
#include <map>
#include "Token.h"
/// Tokenizer that walks an input string and produces `Token` objects.
///
/// Typical use: call `initialize()` with the source text, then read tokens
/// with `current()` / `next()` until `eof()` reports that nothing is left.
class Lexer {
public:
    /// Creates a lexer with no input, positioned at offset 0 on line 1.
    Lexer() : m_position(0), m_line(1) {}

    /// Loads `input` and reads its first token.
    void initialize(const std::string & input);

    /// Reads the next token from the input, stores it and returns it.
    Token next();

    /// Returns a copy of the most recently read token.
    Token current() const { return m_current; }

    /// Reports whether only whitespace remains after the current position.
    bool eof() const;

    /// Offset into the input of the next character to be examined.
    unsigned int position() const { return m_position; }

    /// Current line number, starting at 1.
    unsigned int line() const { return m_line; }

private:
    /// Builds a keyword or identifier token from the text beginning at `start`.
    Token val_keyword(unsigned int start);

    /// Builds a token of kind `type` from the text beginning at `start`.
    Token get_token(unsigned int start, TokenType type);

    /// Consumes one input character and hands back `state`.
    unsigned int next_char_state(unsigned int state);

    // Lookup tables, defined in the implementation file.
    static const std::map<std::string, TokenType> KEYWORDS;
    static const std::map<char, unsigned int> SINGLE_CHAR;
    static const std::map<unsigned int, TokenType> END_STATES;

    std::string m_input;      ///< Text being tokenized.
    unsigned int m_position;  ///< Offset of the next unread character.
    unsigned int m_line;      ///< Line counter, starting at 1.
    Token m_current;          ///< Last token produced.
};
#endif // LEXER_H
#ifndef LEXER_EXCEPTION_H
#define LEXER_EXCEPTION_H
#include <exception>
#include <sstream>
#include <string>
/// Exception raised by the lexer when the input cannot be tokenized.
class LexerException : public std::exception {
    std::string m_message;  // text returned by what()

public:
    /// Creates an exception whose what() text is `message`, verbatim.
    LexerException(const std::string & message) : m_message(message) {}

    /// Creates an exception from a line number and a message.
    /// Defined out of line; the exact format is decided there.
    LexerException(unsigned int line, const std::string & message);

    // `noexcept` replaces the deprecated `throw()` specification (removed in
    // C++20); `override` lets the compiler verify the base-class signatures.
    ~LexerException() noexcept override {}

    /// Returns the stored message as a C string owned by this object.
    const char * what() const noexcept override
        {return this->m_message.c_str();}
};
#endif // LEXER_EXCEPTION_H
#ifndef PARSER_H
#define PARSER_H
#include <string>
#include "Lexer.h"
/// Parser that validates an input string; `parse()` is its only public
/// operation.
///
/// The private `val_*` methods appear to correspond one-to-one with the
/// non-terminals of the grammar (NOTE(review): inferred from the names;
/// confirm against the implementation file).
class Parser {
public:
    Parser() {}

    /// Parses `input`.
    void parse(const std::string & input);

private:
    // --- Token helpers ---------------------------------------------------
    void check_current(TokenType::Type type);
    void error(TokenType expected);

    // --- Program structure -----------------------------------------------
    void val_algoritmo();
    void val_nome();
    void val_fdec();
    void val_func();
    void val_proc();
    void val_param_d();
    void val_param_dl();
    void val_com();

    // --- Declarations and types ------------------------------------------
    void val_var();
    void val_ldec();
    void val_dec();
    void val_lid();
    void val_tipo();
    void val_tipo(bool simple);
    void val_vdec();
    void val_matriz();

    // --- Statements ------------------------------------------------------
    void val_atrib_func();
    void val_se();
    void val_senao();
    void val_para();
    void val_repita();
    void val_enquanto();
    void val_funcao();
    void val_procedimento();
    void val_leia();
    void val_escreva();

    // --- Expressions -----------------------------------------------------
    void val_exp();
    void val_exp_ou();
    void val_ou();
    void val_exp_e();
    void val_e();
    void val_exp_rel();
    void val_rel();
    void val_exp_mais();
    void val_mais();
    void val_exp_mul();
    void val_mul();
    void val_exp_pot();
    void val_pot();

    // --- Terms and identifiers -------------------------------------------
    void val_t();
    void val_id();
    void val_subs();
    void val_subs_mat();

    Lexer m_lexer;  ///< Token source used by the methods above.
};
#endif // PARSER_H
// The macro defined here must match the one tested by #ifndef; otherwise the
// guard never takes effect and a second inclusion redefines the class.
#ifndef PARSER_EXCEPTION_H
#define PARSER_EXCEPTION_H
#include <exception>
#include <string>
#include <sstream>
#include "Token.h"
#include "TokenType.h"
/// Exception raised by the parser when the token stream is not valid.
class ParserException : public std::exception {
    std::string m_message;  // text returned by what()

public:
    /// Creates an exception from a line number and a free-form message.
    /// Defined out of line.
    ParserException(unsigned int line, const std::string & message);

    /// Creates an exception from a line number, the token type that was
    /// expected and the token that was actually found. Defined out of line.
    ParserException(
        unsigned int line, TokenType expected, const Token & found);

    // `noexcept` replaces the deprecated `throw()` specification (removed in
    // C++20); `override` lets the compiler verify the base-class signatures.
    ~ParserException() noexcept override {}

    /// Returns the stored message as a C string owned by this object.
    const char * what() const noexcept override
        {return this->m_message.c_str();}
};
#endif // PARSER_EXCEPTION_H
#ifndef TOKEN_H
#define TOKEN_H
#include <ostream>
#include <string>
#include "TokenType.h"
/// A lexical token: the matched source text together with its type.
class Token {
    // Stream output is implemented outside the class and reads the private
    // members directly.
    friend std::ostream & operator<<(
        std::ostream & os, const Token & token);

public:
    /// Creates a token. With no arguments the text is empty and the type is
    /// `TokenType::TK_INVALIDO`.
    Token(
        const std::string & text = std::string(),
        TokenType type = TokenType::TK_INVALIDO)
        : m_text(text)
        , m_type(type)
    {}

    /// Returns a copy of the token text.
    std::string text() const { return m_text; }

    /// Returns the token type.
    TokenType type() const { return m_type; }

private:
    std::string m_text;
    TokenType m_type;
};

/// Writes `token` to `os`; defined in the implementation file.
std::ostream & operator<<(std::ostream & os, const Token & token);
#endif // TOKEN_H
#ifndef TOKEN_TYPE_H
#define TOKEN_TYPE_H
#include <string>
#include <ostream>
/// Thin wrapper around the `Type` enumeration of token kinds.
///
/// Implicitly constructible from a `Type` enumerator, so enumerators can be
/// used wherever a `TokenType` is expected.
class TokenType {
    friend std::ostream & operator<<(std::ostream & os, TokenType type);

public:
    // Enumerator order fixes the numeric values (0..63); do not reorder.
    enum Type {
        // Default / invalid token.
        TK_INVALIDO = 0,
        // Program structure.
        TK_ALGORITMO = 1, TK_VAR, TK_INICIO, TK_FIM_ALG,
        // Loops.
        TK_PARA, TK_DE, TK_ATE, TK_FACA, TK_FIMPARA,
        TK_REPITA, TK_ENQUANTO, TK_FIMENQUANTO,
        // Conditionals.
        TK_SE, TK_ENTAO, TK_SENAO, TK_FIMSE,
        // I/O and basic types.
        TK_ESCREVA, TK_LEIA,
        TK_INTEIRO, TK_REAL,
        // Comments, literals and identifiers.
        TK_COMENT, TK_STRING, TK_CONST_INT, TK_IDENT,
        // Punctuation and assignment.
        TK_DOIS_PONTOS, TK_VIRGULA, TK_ATRIB,
        // Arithmetic operators.
        TK_MULT, TK_MAIS, TK_MENOS, TK_DIVIDE, TK_DIVINT, TK_RESTO,
        TK_ABRE_PAR, TK_FECHA_PAR,
        // Relational and logical operators.
        TK_MENOR, TK_MAIOR, TK_E, TK_OU,
        TK_IGUAL, TK_MENORIGUAL, TK_MAIORIGUAL, TK_DIFERENTE,
        // Miscellaneous.
        TK_NOVA_LINHA, TK_LITERAL, TK_VETOR,
        TK_ABRECOLCHETE, TK_PONTOPONTO, TK_FECHACOLCHETE,
        TK_LOGICO, TK_POTENCIACAO, TK_CONST_REAL, TK_ESCREVAL,
        TK_FALSO, TK_VERDADEIRO,
        // Subprograms.
        TK_FUNCAO, TK_FIMFUNCAO, TK_PROCEDIMENTO, TK_FIMPROCEDIMENTO,
        TK_PONTO_E_VIRGULA, TK_RETORNE, TK_NAO, TK_PASSO,
    };

    /// Creates an invalid token type (`TK_INVALIDO`).
    TokenType() : m_type(TK_INVALIDO) {}

    /// Wraps the enumerator `type`; intentionally implicit.
    TokenType(Type type) : m_type(type) {}

    /// Returns the wrapped enumerator.
    Type type() const { return m_type; }

    /// Returns a textual form of the type; defined in the implementation file.
    std::string toString() const;

private:
    Type m_type;
};

/// Writes `type` to `os`; defined in the implementation file.
std::ostream & operator<<(std::ostream & os, TokenType type);

// Comparisons against a raw enumerator; defined in the implementation file.
bool operator==(TokenType type1, TokenType::Type type2);
bool operator!=(TokenType type1, TokenType::Type type2);
#endif // TOKEN_TYPE_H
QT =
OBJECTS_DIR = obj
DEPENDPATH = include
INCLUDEPATH = include
HEADERS = include/*.h
SOURCES = src/*.cpp
#include "Lexer.h"
#include <algorithm>
#include <cassert>
#include <string>
#include "LexerException.h"
/*static*/
// Reserved words of the language, keyed by their lower-case spelling.
// val_keyword() lower-cases the scanned text before looking it up here, so
// keywords are matched case-insensitively. Any word not listed is an
// identifier (TK_IDENT).
const std::map<std::string, TokenType> Lexer::KEYWORDS{
    {"algoritmo", TokenType::TK_ALGORITMO},
    {"ate", TokenType::TK_ATE},
    {"de", TokenType::TK_DE},
    // NOTE(review): "div" maps to TK_DIVIDE and "mod" to TK_DIVINT although
    // the enum also offers TK_DIVINT / TK_RESTO. Left unchanged because the
    // parser's expectations are not visible here -- confirm before changing.
    {"div", TokenType::TK_DIVIDE},
    {"e", TokenType::TK_E},
    {"enquanto", TokenType::TK_ENQUANTO},
    {"entao", TokenType::TK_ENTAO},
    {"escreva", TokenType::TK_ESCREVA},
    {"escreval", TokenType::TK_ESCREVAL},
    {"faca", TokenType::TK_FACA},
    {"falso", TokenType::TK_FALSO},
    {"fimalgoritmo", TokenType::TK_FIM_ALG},
    {"fimenquanto", TokenType::TK_FIMENQUANTO},
    {"fimfuncao", TokenType::TK_FIMFUNCAO},
    {"fimpara", TokenType::TK_FIMPARA},
    {"fimprocedimento", TokenType::TK_FIMPROCEDIMENTO},
    {"fimse", TokenType::TK_FIMSE},
    {"funcao", TokenType::TK_FUNCAO},
    {"inicio", TokenType::TK_INICIO},
    {"inteiro", TokenType::TK_INTEIRO},
    {"leia", TokenType::TK_LEIA},
    {"literal", TokenType::TK_LITERAL},
    {"logico", TokenType::TK_LOGICO},
    {"mod", TokenType::TK_DIVINT},
    {"ou", TokenType::TK_OU},
    {"para", TokenType::TK_PARA},
    // Was missing: without this entry "procedimento" was lexed as TK_IDENT,
    // so TK_PROCEDIMENTO could never be produced even though the grammar and
    // the closing keyword "fimprocedimento" both exist.
    {"procedimento", TokenType::TK_PROCEDIMENTO},
    {"real", TokenType::TK_REAL},
    {"repita", TokenType::TK_REPITA},
    {"retorne", TokenType::TK_RETORNE},
    {"se", TokenType::TK_SE},
    {"senao", TokenType::TK_SENAO},
    {"var", TokenType::TK_VAR},
    {"verdadeiro", TokenType::TK_VERDADEIRO},
    {"vetor", TokenType::TK_VETOR},
};
/*static*/
// First-character dispatch table: maps a symbol seen in the initial lexer
// state to the state the lexer enters after consuming it.
const std::map<char, unsigned int> Lexer::SINGLE_CHAR{
    // Relational / assignment symbols.
    {'<', 5}, {'>', 9}, {'=', 11},
    // Arithmetic symbols.
    {'-', 12}, {'+', 13}, {'*', 14}, {'/', 15}, {'^', 25},
    // Separators.
    {':', 17}, {',', 20}, {';', 28}, {'.', 26},
    // String delimiter.
    {'"', 18},
    // Grouping symbols.
    {'(', 21}, {')', 22}, {'[', 23}, {']', 24},
};
/*static*/
// Maps a lexer state number to the token type associated with it.
// NOTE(review): presumably these are the states in which a token is complete
// without looking at further input -- confirm against Lexer::next().
const std::map<unsigned int, TokenType> Lexer::END_STATES{
    // Assignment and relational operators.
    {6, TokenType::TK_ATRIB},
    {7, TokenType::TK_MENORIGUAL},
    {8, TokenType::TK_DIFERENTE},
    {10, TokenType::TK_MAIORIGUAL},
    {11, TokenType::TK_IGUAL},

    // Arithmetic operators.
    {12, TokenType::TK_MENOS},
    {13, TokenType::TK_MAIS},
    {14, TokenType::TK_MULT},
    {25, TokenType::TK_POTENCIACAO},

    // Separators.
    {17, TokenType::TK_DOIS_PONTOS},
    {20, TokenType::TK_VIRGULA},
    {27, TokenType::TK_PONTOPONTO},
    {28, TokenType::TK_PONTO_E_VIRGULA},

    // String literal.
    {19, TokenType::TK_STRING},

    // Grouping symbols.
    {21, TokenType::TK_ABRE_PAR},
    {22, TokenType::TK_FECHA_PAR},
    {23, TokenType::TK_ABRECOLCHETE},
    {24, TokenType::TK_FECHACOLCHETE},
};
// Loads `input` as the text to tokenize and reads its first token, which is
// then available through current().
//
// All scanning state is reset, including the line counter, so a Lexer can be
// re-initialized with new input and still report correct line numbers.
// Any exception raised by next() propagates to the caller.
void Lexer::initialize(const std::string & input) {
    this->m_input = input;
    this->m_position = 0;
    this->m_line = 1;  // previously kept the count from the prior input
    this->next();
}
// Reports whether nothing but whitespace remains from the current position
// to the end of the input. Does not modify the lexer.
//
// The whitespace set (newline, tab, space, carriage return) matches what
// next() skips in its initial state, so input ending in "\r\n" is reported
// as finished instead of causing next() to run off the end.
bool Lexer::eof() const {
    const std::string::size_type size = this->m_input.size();
    std::string::size_type i = this->m_position;
    // Strict '<' keeps the index inside the text; the old '<=' bound also
    // inspected the terminator slot at index size().
    while(i < size && (
            this->m_input[i] == '\n'
            || this->m_input[i] == '\t'
            || this->m_input[i] == ' '
            || this->m_input[i] == '\r'))
        ++i;
    // '>=' so a position already past the end also counts as end of input.
    return i >= size;
}
// Builds the token for the word spanning [start, m_position) of the input.
//
// The word is compared case-insensitively against KEYWORDS: a match yields
// the keyword's type, anything else is TK_IDENT. The token text keeps the
// original spelling. The token is stored in m_current and returned.
Token Lexer::val_keyword(unsigned int start) {
    const std::string token =
        this->m_input.substr(start, this->m_position - start);
    std::string token_to_lower(token);
    std::transform(
        std::begin(token_to_lower),
        std::end(token_to_lower),
        std::begin(token_to_lower),
        // tolower() has undefined behaviour for negative arguments other
        // than EOF, so go through unsigned char.
        [](char ch) {
            return static_cast<char>(
                tolower(static_cast<unsigned char>(ch)));
        });
    const auto it = KEYWORDS.find(token_to_lower);
    const TokenType type = it != std::end(KEYWORDS)
        ? it->second
        : TokenType(TokenType::TK_IDENT);
    return this->m_current = Token(token, type);
}
// Builds a token of kind `type` whose text is the input slice
// [start, m_position), records it as the current token and returns it.
Token Lexer::get_token(unsigned int start, TokenType type) {
    const unsigned int length = m_position - start;
    const std::string text = m_input.substr(start, length);
    m_current = Token(text, type);
    return m_current;
}
unsigned int Lexer::next_char_state(unsigned int state) {
++this->m_position;
return state;
}
Token Lexer::next() {
if(this->m_position == this->m_input.size())
throw LexerException("EOF");
const unsigned int start = this->m_position;
unsigned int state = 0;
while(true) {
char c = this->m_input[this->m_position];
switch(state) {
case 0:
if(c == '\n') {
++this->m_line;
++this->m_position;
return m_current = next();
}
if(c == ' ' || c == '\t' || c == '\r') {
++this->m_position;
return m_current = next();
}
if(isalpha(c) || c == '_')
state = this->next_char_state(1);
else if(isdigit(c))
state = this->next_char_state(2);
else {
auto it = this->SINGLE_CHAR.find(c);
if(it != std::end(this->SINGLE_CHAR))
state = this->next_char_state(it->second);
else
throw LexerException(
this->m_line,
std::string("Unknown symbol: '") + c + "'");
}
break;
case 1:
if(!std::isalnum(c) && c != '_')
return this->val_keyword(start);
++this->m_position;
break;
case 2:
if(isdigit(c))
++this->m_position;
else if(c == '.')
state = this->next_char_state(3);
else
return this->get_token(start, TokenType::TK_CONST_INT);
break;
case 3:
if(c == '.') {
--this->m_position;
return this->get_token(start, TokenType::TK_CONST_INT);
} else if(std::isdigit(c))
state = this->next_char_state(4);
else
throw LexerException(
this->m_line,
"Expected digit after point (e.g. '1.' is not valid)");
break;
case 4:
if(isdigit(c))
++this->m_position;
else
return this->get_token(start, TokenType::TK_CONST_REAL);
break;
case 5:
if(c == '-')
state = this->next_char_state(6);
else if(c == '=')
state = this->next_char_state(7);
else if(c == '>')
state = this->next_char_state(8);
else
return this->get_token(start, TokenType::TK_MENOR);
break;
case 9:
if(c == '=')
state = this->next_char_state(10);
else
return this->get_token(start, TokenType::TK_MAIOR);
break;
case 15:
if(c == '/')
state = this->next_char_state(16);
else
return this->get_token(start, TokenType::TK_DIVIDE);
break;
case 16:
if(c != '\n')
++this->m_position;
else {
++this->m_position;
return this->m_current = next();
}
break;
case 18:
if(c != '"')
++this->m_position;
else