/*
 * File:  scanner.l
 * ----------------
 * Lex input file to generate the scanner for the compiler.
 */

%{

#include "scanner.h"
#include "utility.h" // for PrintDebug()
#include "errors.h"
#include <string.h>  // for strncpy(), strdup()
#include "parser.h"

#define TAB_SIZE 8

/* Global variables
 * ----------------
 * (For shame!) But we need a few to keep track of things that are
 * preserved between calls to yylex or used outside the scanner.
 *
 * curLineNum, curColNum: 1-based position of the next character to scan.
 * curLine:               copy of the text of the line being scanned, kept
 *                        so it can be handed out by GetLineNumbered().
 */
static int curLineNum, curColNum;
static char curLine[512];

/* Runs before the action of every matched rule (see YY_USER_ACTION). */
static void DoBeforeEachAction();
#define YY_USER_ACTION DoBeforeEachAction();

%}

/* States
 * ------
 * Our strategy for handling nested comments uses two lex states (N & C)
 * N = Normal (not inside a comment, we start in this state)
 * C = Comment (currently inside a comment)
 * Both are inclusive states (i.e. apply when explicitly named or none named)
 * Most rules will trigger when in normal mode, e.g. processing keywords
 * and identifiers only happens outside a comment. A few rules apply
 * when inside a comment (end-comment, EOF), and a few rules are used
 * in both states (ignoring whitespace, counting newlines, starting
 * another comment). To track nesting depth, we turn on the stack option
 * so we can use lex's state stack. Each time we find a comment start,
 * we push a comment state, each time we find an end-comment, we pop.
 * Eventually this will return to the normal state in which we started.
 * (We could also have tracked this with our own integer counter).
 * Another little wrinkle on states is the COPY exclusive state which
 * I added to first match each line and copy it to a saved buffer
 * before re-processing it. This allows us to print the entire line
 * to provide context on errors.
 */
%option stack
%s N C
%x COPY

/* Definitions
 * -----------
 * To make our rules more readable, we establish some definitions here.
 */
DIGIT             ([0-9])
HEX_DIGIT         ([0-9a-fA-F])
HEX_INTEGER       (0[Xx]{HEX_DIGIT}+)
INTEGER           ({DIGIT}+)
EXPONENT          ([Ee][-+]?{INTEGER})
DOUBLE            ({INTEGER}"."{DIGIT}*{EXPONENT}?)
BEG_STRING        (\"[^"\n]*)
STRING            ({BEG_STRING}\")
IDENTIFIER        ([a-zA-Z][a-zA-Z_0-9]*)
/* NOTE(review): '<' and '>' are assumed to belong to the single-character
 * operators (the language has ">="); confirm against the grammar. */
OPERATOR          ([+\-*/%=\\.,;!<>()\[\]{}])
BEG_COMMENT       ("/*")
END_COMMENT       ("*/")
SINGLE_COMMENT    ("//"[^\n]*)

%%             /* BEGIN RULES SECTION */

 /* ------------------ Line copying / whitespace ------------------ */
 /* In COPY we grab the whole upcoming line into curLine, then push the
  * text back with yyless(0) so it is re-scanned in the previous state.
  * The copy is bounded and always NUL-terminated, so an over-long line
  * is truncated rather than leaving curLine without a terminator. */
<COPY>.*               { strncpy(curLine, yytext, sizeof(curLine) - 1);
                         curLine[sizeof(curLine) - 1] = '\0';
                         curColNum = 1;
                         yy_pop_state();
                         yyless(0); }
<COPY><<EOF>>          { yy_pop_state(); }
 /* Newlines are counted in every state; an empty line is matched here
  * while still in COPY, in which case COPY must not be pushed again. */
<*>\n                  { curLineNum++;
                         curColNum = 1;
                         if (YYSTATE != COPY) yy_push_state(COPY); }

[ ]+                   { /* ignore all spaces in normal or comment */ }
 /* YY_USER_ACTION has already advanced curColNum one past the tab, so
  * the tab itself sat in column (curColNum - 1).  Jump to the first
  * column after the next tab stop (columns are 1-based). */
[\t]                   { curColNum = ((curColNum - 2) / TAB_SIZE + 1) * TAB_SIZE + 1; }

 /* -------------------- Comments ----------------------------- */
{BEG_COMMENT}          { yy_push_state(C); }
<C>{END_COMMENT}       { yy_pop_state(); }
<C><<EOF>>             { ReportError(&yylloc, err_unterm_comment);
                         return 0; }
<C>[^*\n\t/]*          { /* grab all non-star, non-slash, non-newline */ }
<C>.                   { /* ignore everything else that doesn't match */ }
<N>{SINGLE_COMMENT}    { /* skip to end of line for // comment */ }

 /* --------------------- Keywords ------------------------------- */
<N>"void"              { return T_Void;        }
<N>"int"               { return T_Int;         }
<N>"double"            { return T_Double;      }
<N>"bool"              { return T_Bool;        }
<N>"string"            { return T_String;      }
<N>"null"              { return T_Null;        }
<N>"class"             { return T_Class;       }
<N>"extends"           { return T_Extends;     }
<N>"this"              { return T_This;        }
<N>"while"             { return T_While;       }
<N>"for"               { return T_For;         }
<N>"if"                { return T_If;          }
<N>"else"              { return T_Else;        }
<N>"return"            { return T_Return;      }
<N>"break"             { return T_Break;       }
<N>"New"               { return T_New;         }
<N>"NewArray"          { return T_NewArray;    }
<N>"Print"             { return T_Print;       }
<N>"ReadInteger"       { return T_ReadInteger; }
<N>"ReadLine"          { return T_ReadLine;    }

 /* -------------------- Operators ----------------------------- */
 /* NOTE(review): the "<=" rule and its token name T_LessEqual are
  * reconstructed by analogy with ">=" / T_GreaterEqual -- confirm
  * against parser.h. */
<N>"<="                { return T_LessEqual;   }
<N>">="                { return T_GreaterEqual;}
<N>"=="                { return T_Equal;       }
<N>"!="                { return T_NotEqual;    }
<N>"&&"                { return T_And;         }
<N>"||"                { return T_Or;          }
<N>{OPERATOR}          { return yytext[0];     }

 /* -------------------- Constants ------------------------------ */
<N>"true"|"false"      { yylval.boolConstant = (yytext[0] == 't');
                         return T_BoolConstant; }
<N>{INTEGER}           { yylval.integerConstant = strtol(yytext, NULL, 10);
                         return T_IntConstant; }
<N>{HEX_INTEGER}       { yylval.integerConstant = strtol(yytext, NULL, 16);
                         return T_IntConstant; }
<N>{DOUBLE}            { yylval.doubleConstant = atof(yytext);
                         return T_DoubleConstant; }
<N>{STRING}            { yylval.stringConstant = strdup(yytext);
                         return T_StringConstant; }
<N>{BEG_STRING}        { ReportError(&yylloc, err_unterm_string, yytext); }

 /* -------------------- Identifiers --------------------------- */
<N>{IDENTIFIER}        {
                        /* The lexer records the identifier name in yylval.
                         * The parser is responsible for looking up the name
                         * in the appropriate scope(s) to find the decl.
                         * The copy is bounded by the destination buffer and
                         * explicitly terminated, so an over-long name is
                         * truncated instead of being left unterminated.
                         */
                         strncpy(yylval.identifier, yytext,
                                 sizeof(yylval.identifier)-1);
                         yylval.identifier[sizeof(yylval.identifier)-1] = '\0';
                         return T_Identifier; }

 /* -------------------- Default rule (error) -------------------- */
<N>.                   { ReportError(&yylloc, err_unrecog_char, yytext[0]); }

%%

/*
 * Function: yywrap()
 * ------------------
 * Called by the scanner when it reaches end of input. Returning 1
 * tells it there is no further input to continue with.
 */
int yywrap()
{
    return 1;
}

/*
 * Function: Inityylex()
 * --------------------
 * This function will be called before any calls to yylex().  It is designed
 * to give you an opportunity to do anything that must be done to initialize
 * the scanner (set global variables, configure starting state, etc.). One
 * thing it already does for you is assign the value of the global variable
 * yy_flex_debug that controls whether flex prints debugging information
 * about each token and what rule was matched. If set to false, no information
 * is printed. Setting it to true will give you a running trail that might
 * be helpful when debugging your scanner. Please be sure the variable is
 * set to false when submitting your final version.
 */
void Inityylex()
{
    PrintDebug("lex", "Initializing scanner");
    yy_flex_debug = false;
    BEGIN(N);             // Start in Normal state
    yy_push_state(COPY);  // but copy first line at start
    curLineNum = 1;
    curColNum = 1;
}

/*
 * Function: DoBeforeEachAction()
 * ------------------------------
 * This function is installed as the YY_USER_ACTION. This is a place
 * to group code common to all actions.
 * On each match, we fill in the fields to record its location and
 * update our column counter.
 */
static void DoBeforeEachAction()
{
    yylloc.first_line = curLineNum;
    yylloc.first_column = curColNum;
    yylloc.last_column = curColNum + yyleng - 1;
    curColNum += yyleng;
}

/* Function: GetLineNumbered()
 * ---------------------------
 * Returns string with contents of line numbered n or NULL if the
 * contents of that line are no longer available.  Basically only the
 * line currently being scanned is available, although we could keep
 * a few lines back if we put more effort into it :-). The pointer
 * returned is to an internally maintained static buffer which will
 * be overwritten. If you want to preserve, be sure to copy elsewhere.
 */
const char *GetLineNumbered(int num)
{
    return (num == curLineNum) ? curLine : NULL;
}