#ifdef WIN32 #pragma warning(disable: 4786) // Stop warnings about truncated identifiers in debug info #endif #include "lexer.h" #include "parser.h" #include "../utils/exceptionstream.h" #include "../utils/debug.h" #include #include #include #include #include #include // (1) turns debug output off, (0) turns debug output on. //#define debug (0) ? (void)std::cerr : std::cerr using namespace parser; parser::lexer_t::FailedInfo::FailedInfo( const std::type_info& type0, const char* description0, const char* pos0, const char* backtrackpos0) : type( &type0), description( (description0) ? description0 : ""), backtrackpos( backtrackpos0), pos( pos0) { //debug << "Created FailedInfo: backtrackpos=" << ((void*)backtrackpos) << ", pos=" << ((void*)pos) << "\n"; assert( pos>=backtrackpos); } namespace { std::ostream& OutputLinePos( std::ostream& out, int column, const std::string& line) /* Prints a `^' in column `column', taking care to match the tabs in `line'. Tabs are assumed to align to the next 8-character column. */ { if ( column > (int) line.size()) { std::cerr << "** correcting internal error - column " << column << " in line of length " << line.size() << "\n"; column = line.size(); } for ( int i=0, col=0; idescription() << "'\n"; } else { if ( lexer.GetFailedTerminals().empty()) out << "No known failed terminals"; else { const char* last_error_pos = NULL; const char* last_linestart = NULL; const char* last_backtrackpos = NULL; std::string line; lexer_t::FailedTerminals::const_iterator it; if ( lexer.show_all_errorpos) { it = lexer.GetFailedTerminals().begin(); } else { /* We show only the last of failed terminals that have same pos. Probably a way of doing this using algorithms, but I can't be bothered writing all that mem_fn_ref stuff... */ const char* lastpos = NULL; for ( lexer_t::FailedTerminals::const_iterator it2 = lexer.GetFailedTerminals().begin(); it2!=lexer.GetFailedTerminals().end(); ++it2) { if ( it2->pos != lastpos) { lastpos = it2->pos; it = it2; } } } for ( /**/; it!=lexer.GetFailedTerminals().end(); ++it) { const char* backtrackpos = it->backtrackpos; const char* linestart = lexer.GetLinePos( it->pos); if ( backtrackpos != last_backtrackpos || it==lexer.GetFailedTerminals().begin()) { //const char* backtrackpos_linestart = lexer.GetLinePos( backtrackpos); std::string backtrackpos_line = lexer.GetLine( backtrackpos); if ( lexer.show_startpos) { out << "\nstarting at:\n" << backtrackpos_line << "\n"; //debug << __FILE__ << ":" << __LINE__ << ", linepos=" << backtrackpos-linestart << ", it->backtracepos=" << ((void*)it->backtrackpos) << ", it->pos=" << ((void*)it->pos) << "\n"; OutputLinePos( out, backtrackpos-linestart, backtrackpos_line); } last_backtrackpos = backtrackpos; } if ( linestart != last_linestart || it==lexer.GetFailedTerminals().begin()) { line = lexer.GetLine( it->pos); if ( lexer.show_startpos) { out << "\nat:\n"; } out << line; last_linestart = linestart; // should probably output interveening lines too if errors are long way apart. } if ( it->pos != last_error_pos) { out << "\n"; //debug << __FILE__ << ":" << __LINE__ << ", linepos=" << it->pos - linestart << "\n"; OutputLinePos( out, it->pos - linestart, line); last_error_pos = it->pos; } if ( lexer.show_failed_terminals) { out << " `" << it->description << "'"; } } } } return out; } } namespace parser { error_t::error_t( lexer_t& lexer, const std::string& text_, bool dont_call_peeknext) : text() { //debug << "error_t::error_t:\n"; //debug_output_backtraces(); std::stringstream ss; AddPosInfo( ss, lexer, text_, dont_call_peeknext); ss << "\n"; #ifndef NDEBUG ss << "Backtrace is:\n"; debug_output_backtraces( ss); #endif this->text = ss.str(); } error_t::error_t( lexer_t& lexer, const parser::node_t& node, bool dont_call_peeknext) : text( NULL) { //debug << "error_t::error_t:\n"; //debug_output_backtraces(); std::stringstream ss; std::string message = "Can't handle node: " + node.output_tostring(); AddPosInfo( ss, lexer, message, dont_call_peeknext); this->text = ss.str(); } const char* error_t::what() const throw() { return this->text.c_str(); } } namespace { inline void EatWhite( const char*& text) { while ( *text && isspace( *text)) ++text; } inline bool IsAl_( char c) { return c=='_' || isalpha( c); } inline bool IsAlNum_( char c) { return c=='_' || isalnum( c); } inline int StartsWith( const char* text, const char* first) /* if_t text starts with the complete string 'first', returns length of first. Else returns 0. Eg. StartsWith( "abcdefghijk", "abc") returns 3. */ { //Profile::HierScope prof( "StartsWith"); int len; for ( len=0; *text==*first && *text; ++text, ++first, ++len) {} if ( *first==0) return len; else return 0; } inline bool Match( const char*& text, const char* first) /* if_t text starts with first, increments text by length of first and returns true. Else returns false with text unchanged. Eg. char* text="abcdefghijklm"; char* first="abc"; Match( text, first) increments text by 3, and returns true. */ { int len = StartsWith( text, first); text += len; return (len>0) ? true : false; } inline bool MatchAlNum_( const char*& text, const char* name) /* As Match, but only succeeds if next character in text is not alphanumeric or '_'. Used for matching identifiers: eg text="abcdef.foo", name="abcdef" - succeeds text="abcdef.foo", name="abc" - fails. */ { int len = StartsWith( text, name); if ( len && !IsAlNum_( text[len])) { text += len; return true; } return false; } template< class T> inline keyword_t* make_keyword( const char* begin, const char* end) { return new T( begin, end); } typedef keyword_t* (*keyword_factory_t)( const char* begin, const char* end); struct index_t; struct index_item { index_item() : factory( NULL), next( NULL), alnum_( false) {} keyword_factory_t factory; // null if this isn't end of a keyword. index_t* next; bool alnum_; }; struct index_t { index_item items[ 256]; }; struct keyword_map_t { index_item index; }; void keyword_add( keyword_map_t& map, const std::string& text, keyword_factory_t factory, bool alnum_) { index_item* in=&map.index; for ( int i=0; i<(int)text.size(); ++i) { if ( !in->next) { in->next = new index_t; } in = &in->next->items[ text[i]]; } if ( in->factory) { debug << debug_PLACE << "keyword_add: text=" << text << " is already in map\n"; } assert( !in->factory); in->factory = factory; in->alnum_ = alnum_; debug0 << debug_PLACE << "have added keyword " << text << "\n"; } template< class T> void keyword_add( keyword_map_t& map, bool alnum_) { keyword_add( map, T::name_t(), make_keyword< T>, alnum_); } keyword_t* keyword_lookup( keyword_map_t& map, const char*& text) { debug0 << debug_PLACE << "\n"; keyword_factory_t best_factory = NULL; const char* best_c = NULL; index_item* in=&map.index; const char* c; for ( c=text; *c; ++c) { debug0 << debug_PLACE << "keyword_lookup: looking at char `" << *c << "', factory=" << in->factory << "\n"; if ( !in->next) break; in = &in->next->items[ *c]; if ( in->factory) { debug0 << debug_PLACE << "keyword_lookup: factory is non-null\n"; /* this is only a possible terminator if not alnum, or next char is non-alnum. */ if ( !in->alnum_ || !IsAlNum_( c[1])) { best_factory = in->factory; best_c = c; } } } if ( best_factory) { const char* begin = text; text = best_c+1; keyword_t* ret = best_factory( begin, text); debug0 << debug_PLACE << "keyword_lookup: returning " << *ret << "\n"; return ret; } debug0 << debug_PLACE << "keyword_lookup: returning NULL\n"; return NULL; } void initialise_map_c( keyword_map_t& map) { //keyword_add< keyword_struct>( map, true); keyword_add( map, "struct", make_keyword< keyword_struct>, true); keyword_add( map, "private", make_keyword< keyword_private>, true); keyword_add( map, "enum", make_keyword< keyword_enum>, true); keyword_add( map, "for", make_keyword< keyword_for>, true); keyword_add( map, "while", make_keyword< keyword_while>, true); keyword_add( map, "const", make_keyword< keyword_const>, true); keyword_add( map, "volatile", make_keyword< keyword_volatile>, true); keyword_add( map, "char", make_keyword< keyword_char>, true); keyword_add( map, "int", make_keyword< keyword_int>, true); keyword_add( map, "double", make_keyword< keyword_double>, true); keyword_add( map, "float", make_keyword< keyword_float>, true); keyword_add( map, "long", make_keyword< keyword_long>, true); keyword_add( map, "short", make_keyword< keyword_short>, true); keyword_add( map, "typedef", make_keyword< keyword_typedef>, true); keyword_add( map, "inline", make_keyword< keyword_inline>, true); keyword_add( map, "extern", make_keyword< keyword_extern>, true); keyword_add( map, "signed", make_keyword< keyword_signed>, true); keyword_add( map, "unsigned", make_keyword< keyword_unsigned>, true); keyword_add( map, "static", make_keyword< keyword_static>, true); keyword_add( map, "union", make_keyword< keyword_union>, true); keyword_add( map, "return", make_keyword< keyword_return>, true); keyword_add( map, "if", make_keyword< keyword_if>, true); keyword_add( map, "else", make_keyword< keyword_else>, true); keyword_add( map, "switch", make_keyword< keyword_switch>, true); keyword_add( map, "case", make_keyword< keyword_case>, true); keyword_add( map, "default", make_keyword< keyword_default>, true); keyword_add( map, "do", make_keyword< keyword_do>, true); keyword_add( map, "mutable", make_keyword< keyword_mutable>, true); keyword_add( map, "register", make_keyword< keyword_register>, true); keyword_add( map, "void", make_keyword< keyword_void>, true); keyword_add( map, "asm", make_keyword< keyword_asm>, true); keyword_add( map, "sizeof", make_keyword< keyword_sizeof>, true); keyword_add( map, keyword_COLONCOLON::name_t(), make_keyword< keyword_COLONCOLON>, false); keyword_add( map, keyword_COLON::name_t(), make_keyword< keyword_COLON>, false); keyword_add( map, keyword_OPENROUND::name_t(), make_keyword< keyword_OPENROUND>, false); keyword_add( map, keyword_CLOSEROUND::name_t(), make_keyword< keyword_CLOSEROUND>, false); keyword_add( map, keyword_OPENSQUARE::name_t(), make_keyword< keyword_OPENSQUARE>, false); keyword_add( map, keyword_CLOSESQUARE::name_t(), make_keyword< keyword_CLOSESQUARE>, false); keyword_add( map, keyword_OPENCURLY::name_t(), make_keyword< keyword_OPENCURLY>, false); keyword_add( map, keyword_CLOSECURLY::name_t(), make_keyword< keyword_CLOSECURLY>, false); keyword_add( map, keyword_QUESTION::name_t(), make_keyword< keyword_QUESTION>, false); keyword_add( map, keyword_SEMICOLON::name_t(), make_keyword< keyword_SEMICOLON>, false); keyword_add( map, keyword_COMMA::name_t(), make_keyword< keyword_COMMA>, false); keyword_add( map, keyword_DOTDOTDOT::name_t(), make_keyword< keyword_DOTDOTDOT>, false); keyword_add( map, keyword_DOT::name_t(), make_keyword< keyword_DOT>, false); keyword_add( map, keyword_ARROW::name_t(), make_keyword< keyword_ARROW>, false); keyword_add( map, keyword_AMPERSAND::name_t(), make_keyword< keyword_AMPERSAND>, false); keyword_add( map, keyword_GTGT::name_t(), make_keyword< keyword_GTGT>, false); keyword_add( map, keyword_GTGTEQ::name_t(), make_keyword< keyword_GTGTEQ>, false); keyword_add( map, keyword_GT::name_t(), make_keyword< keyword_GT>, false); keyword_add( map, keyword_GTEQ::name_t(), make_keyword< keyword_GTEQ>, false); keyword_add( map, keyword_LTLT::name_t(), make_keyword< keyword_LTLT>, false); keyword_add( map, keyword_LTLTEQ::name_t(), make_keyword< keyword_LTLTEQ>, false); keyword_add( map, keyword_LT::name_t(), make_keyword< keyword_LT>, false); keyword_add( map, keyword_LTEQ::name_t(), make_keyword< keyword_LTEQ>, false); keyword_add( map, keyword_TILDE::name_t(), make_keyword< keyword_TILDE>, false); keyword_add( map, keyword_TILDEEQ::name_t(), make_keyword< keyword_TILDEEQ>, false); keyword_add( map, keyword_EQ::name_t(), make_keyword< keyword_EQ>, false); keyword_add( map, keyword_EQEQ::name_t(), make_keyword< keyword_EQEQ>, false); keyword_add( map, keyword_NOT::name_t(), make_keyword< keyword_NOT>, false); keyword_add( map, keyword_NOTEQ::name_t(), make_keyword< keyword_NOTEQ>, false); keyword_add( map, keyword_AMPAMP::name_t(), make_keyword< keyword_AMPAMP>, false); keyword_add( map, keyword_AMPAMPEQ::name_t(), make_keyword< keyword_AMPAMPEQ>, false); keyword_add( map, keyword_AMP::name_t(), make_keyword< keyword_AMP>, false); keyword_add( map, keyword_AMPEQ::name_t(), make_keyword< keyword_AMPEQ>, false); keyword_add( map, keyword_OROR::name_t(), make_keyword< keyword_OROR>, false); keyword_add( map, keyword_OROREQ::name_t(), make_keyword< keyword_OROREQ>, false); keyword_add( map, keyword_OR::name_t(), make_keyword< keyword_OR>, false); keyword_add( map, keyword_OREQ::name_t(), make_keyword< keyword_OREQ>, false); keyword_add( map, keyword_XOR::name_t(), make_keyword< keyword_XOR>, false); keyword_add( map, keyword_XOREQ::name_t(), make_keyword< keyword_XOREQ>, false); keyword_add( map, keyword_DIV::name_t(), make_keyword< keyword_DIV>, false); keyword_add( map, keyword_DIVEQ::name_t(), make_keyword< keyword_DIVEQ>, false); keyword_add( map, keyword_STAR::name_t(), make_keyword< keyword_STAR>, false); keyword_add( map, keyword_STAREQ::name_t(), make_keyword< keyword_STAREQ>, false); keyword_add( map, keyword_PLUSPLUS::name_t(), make_keyword< keyword_PLUSPLUS>, false); keyword_add( map, keyword_PLUSPLUSEQ::name_t(), make_keyword< keyword_PLUSPLUSEQ>, false); keyword_add( map, keyword_MINUSMINUS::name_t(), make_keyword< keyword_MINUSMINUS>, false); keyword_add( map, keyword_MINUSMINUSEQ::name_t(), make_keyword< keyword_MINUSMINUSEQ>, false); keyword_add( map, keyword_PLUS::name_t(), make_keyword< keyword_PLUS>, false); keyword_add( map, keyword_PLUSEQ::name_t(), make_keyword< keyword_PLUSEQ>, false); keyword_add( map, keyword_MINUS::name_t(), make_keyword< keyword_MINUS>, false); keyword_add( map, keyword_MINUSEQ::name_t(), make_keyword< keyword_MINUSEQ>, false); keyword_add( map, keyword_PERCENT::name_t(), make_keyword< keyword_PERCENT>, false); keyword_add( map, keyword_PERCENTEQ::name_t(), make_keyword< keyword_PERCENTEQ>, false); } void initialise_map_cpp( keyword_map_t& map) { initialise_map_c( map); keyword_add( map, "bool", make_keyword< keyword_bool>, true); keyword_add( map, "class", make_keyword< keyword_class>, true); keyword_add( map, "protected", make_keyword< keyword_protected>, true); keyword_add( map, "public", make_keyword< keyword_public>, true); keyword_add( map, "virtual", make_keyword< keyword_virtual>, true); keyword_add( map, "operator", make_keyword< keyword_operator>, true); keyword_add( map, "namespace", make_keyword< keyword_namespace>, true); keyword_add( map, "template", make_keyword< keyword_template>, true); keyword_add( map, "typename", make_keyword< keyword_typename>, true); keyword_add( map, "using", make_keyword< keyword_using>, true); keyword_add( map, "new", make_keyword< keyword_new>, true); keyword_add( map, "throw", make_keyword< keyword_throw>, true); keyword_add( map, "delete", make_keyword< keyword_delete>, true); keyword_add( map, "friend", make_keyword< keyword_friend>, true); keyword_add( map, "explicit", make_keyword< keyword_explicit>, true); keyword_add( map, "catch", make_keyword< keyword_catch>, true); keyword_add( map, "try", make_keyword< keyword_try>, true); } keyword_t* get_keyword_c( const char*& text) { static keyword_map_t map; static bool initialised = false; if ( !initialised) { initialised = true; initialise_map_c( map); } keyword_t* ret = keyword_lookup( map, text); if ( ret) { debug0 << debug_PLACE << "found keyword " << *ret << "\n"; } return ret; } keyword_t* get_keyword_cpp( const char*& text) { static keyword_map_t map; static bool initialised = false; if ( !initialised) { initialised = true; initialise_map_cpp( map); } keyword_t* ret = keyword_lookup( map, text); if ( ret) { debug0 << debug_PLACE << "found keyword " << *ret << "\n"; } return ret; } keyword_t* get_keyword( const char*& text, bool no_cplusplus) { if ( no_cplusplus) return get_keyword_c( text); else return get_keyword_cpp( text); } keyword_t* GetAlNumKeyword( const char*& text, bool no_cplusplus) { if ( no_cplusplus) { throw std::runtime_error( "no_cplusplus not currently supoprted in GetAlNumKeyword"); } std::string alnum; for ( const char* c=text; IsAlNum_( *c); ++c) { alnum += *c; } if ( alnum.size()==0) return NULL; typedef std::map< std::string, keyword_t* (*)( const char* begin, const char* end)> keywords_cplusplus_t; static keywords_cplusplus_t keywords_cplusplus; if ( keywords_cplusplus.size()==0) { keywords_cplusplus[ "struct"] = make_keyword< keyword_struct>; keywords_cplusplus[ "private"] = make_keyword< keyword_private>; keywords_cplusplus[ "enum"] = make_keyword< keyword_enum>; keywords_cplusplus[ "for"] = make_keyword< keyword_for>; keywords_cplusplus[ "while"] = make_keyword< keyword_while>; keywords_cplusplus[ "const"] = make_keyword< keyword_const>; keywords_cplusplus[ "volatile"] = make_keyword< keyword_volatile>; keywords_cplusplus[ "char"] = make_keyword< keyword_char>; keywords_cplusplus[ "int"] = make_keyword< keyword_int>; keywords_cplusplus[ "double"] = make_keyword< keyword_double>; keywords_cplusplus[ "float"] = make_keyword< keyword_float>; keywords_cplusplus[ "long"] = make_keyword< keyword_long>; keywords_cplusplus[ "short"] = make_keyword< keyword_short>; keywords_cplusplus[ "typedef"] = make_keyword< keyword_typedef>; keywords_cplusplus[ "inline"] = make_keyword< keyword_inline>; keywords_cplusplus[ "extern"] = make_keyword< keyword_extern>; keywords_cplusplus[ "signed"] = make_keyword< keyword_signed>; keywords_cplusplus[ "unsigned"] = make_keyword< keyword_unsigned>; keywords_cplusplus[ "static"] = make_keyword< keyword_static>; keywords_cplusplus[ "union"] = make_keyword< keyword_union>; keywords_cplusplus[ "return"] = make_keyword< keyword_return>; keywords_cplusplus[ "if"] = make_keyword< keyword_if>; keywords_cplusplus[ "else"] = make_keyword< keyword_else>; keywords_cplusplus[ "switch"] = make_keyword< keyword_switch>; keywords_cplusplus[ "case"] = make_keyword< keyword_case>; keywords_cplusplus[ "default"] = make_keyword< keyword_default>; keywords_cplusplus[ "do"] = make_keyword< keyword_do>; keywords_cplusplus[ "mutable"] = make_keyword< keyword_mutable>; keywords_cplusplus[ "register"] = make_keyword< keyword_register>; keywords_cplusplus[ "void"] = make_keyword< keyword_void>; keywords_cplusplus[ "asm"] = make_keyword< keyword_asm>; keywords_cplusplus[ "sizeof"] = make_keyword< keyword_sizeof>; keywords_cplusplus[ "bool"] = make_keyword< keyword_bool>; keywords_cplusplus[ "class"] = make_keyword< keyword_class>; keywords_cplusplus[ "protected"]= make_keyword< keyword_protected>; keywords_cplusplus[ "public"] = make_keyword< keyword_public>; keywords_cplusplus[ "virtual"] = make_keyword< keyword_virtual>; keywords_cplusplus[ "operator"] = make_keyword< keyword_operator>; keywords_cplusplus[ "namespace"]= make_keyword< keyword_namespace>; keywords_cplusplus[ "template"] = make_keyword< keyword_template>; keywords_cplusplus[ "typename"] = make_keyword< keyword_typename>; keywords_cplusplus[ "using"] = make_keyword< keyword_using>; keywords_cplusplus[ "new"] = make_keyword< keyword_new>; keywords_cplusplus[ "throw"] = make_keyword< keyword_throw>; keywords_cplusplus[ "delete"] = make_keyword< keyword_delete>; keywords_cplusplus[ "friend"] = make_keyword< keyword_friend>; keywords_cplusplus[ "explicit"] = make_keyword< keyword_explicit>; keywords_cplusplus[ "catch"] = make_keyword< keyword_catch>; keywords_cplusplus[ "try"] = make_keyword< keyword_try>; } keywords_cplusplus_t::iterator it = keywords_cplusplus.find( alnum); if ( it == keywords_cplusplus.end()) return NULL; const char* begin = text; text += alnum.size(); return it->second( begin, text); } keyword_t* GetAlNumKeyword0( const char*& text, bool no_cplusplus) /* Looks for an alphanumeric keyword that matches the start of text, with the following character being non-alphanumeric_. if_t one is found, text is incremented to point to next unused character.*/ { if ( text[0]==0) return NULL; const char* begin = text; #define JOIN( a, b) a ## b #define g( id, name) if ( MatchAlNum_( text, name)) return new JOIN( keyword_, id)( begin, text); #define f( name) g( name, #name) extern void cmm_pragma_detailedparse_off(); { f( struct) f( private) f( enum) f( for) f( while) f( const) f( volatile) f( char) f( int) f( double) f( float) f( long) f( short) f( typedef) f( inline) f( extern) f( signed) f( unsigned) f( static) f( union) f( return) f( if) f( else) f( switch) f( case) f( default) f( do) f( mutable) f( register) f( void) f( asm) f( sizeof) if (!no_cplusplus) { f( bool) f( class) f( protected) f( public) f( virtual) f( operator) f( namespace) f( template) f( typename) f( using) f( new) f( throw) f( delete) f( friend) f( explicit) f( catch) f( try) } } extern void cmm_pragma_detailedparse_on(); #undef f #undef g #undef JOIN return NULL; } keyword_t* GetNonAlNumKeyword( const char*& text) /* Looks for a non-alphanumeric keyword that matches start of text. if_t one is found, text is incremented to point to next unused character. */ { if ( text[0]==0) return NULL; const char* begin = text; #define k( type) if ( Match( text, type::name_t())) return new type( begin, text); #define JOIN( a, b) a ## b extern void cmm_pragma_detailedparse_off(); { k( keyword_COLONCOLON); k( keyword_COLON); k( keyword_OPENROUND); k( keyword_CLOSEROUND); k( keyword_OPENSQUARE); k( keyword_CLOSESQUARE); k( keyword_OPENCURLY); k( keyword_CLOSECURLY); k( keyword_QUESTION); k( keyword_SEMICOLON); k( keyword_COMMA); k( keyword_DOTDOTDOT); k( keyword_DOT); k( keyword_ARROW); k( keyword_AMPERSAND); // Next few keywords can have a '=' after them. Have to look for '*=' before '*' for example: #define keq( name)\ k( JOIN( name, EQ))\ k( name) keq( keyword_GTGT); keq( keyword_GT); keq( keyword_LTLT); keq( keyword_LT); keq( keyword_TILDE); keq( keyword_EQ); keq( keyword_NOT); keq( keyword_AMPAMP); keq( keyword_AMP); keq( keyword_OROR); keq( keyword_OR); keq( keyword_XOR); keq( keyword_DIV); keq( keyword_STAR); keq( keyword_PLUSPLUS); keq( keyword_MINUSMINUS); keq( keyword_PLUS); keq( keyword_MINUS); keq( keyword_PERCENT); } extern void cmm_pragma_detailedparse_on(); #undef keq #undef k #undef JOIN #undef k0 return NULL; } const char* GetLineStart( const char* start, const char* pos) { const char* ret; assert( pos>=start); if ( pos==start) return start; for ( const char* s=pos-1; ; --s) { if ( *s=='\n') { ret = s+1; break; } if ( s==start) { ret = start; break; } } return ret; } int GetIndentation( const char* start, const char* pos) /* we assume tabs are to multiples of 4 characters. */ { const char* s = GetLineStart( start, pos); for ( int i=0, indentation=0; ; ++i) { if ( s[i]==0 || !isspace( s[i])) return indentation; if ( s[i]=='\t') indentation = ( indentation + 5) / 4 * 4; else indentation += 1; } } bool IsFirstNonSpaceOnLine( const char* start, const char* pos) { if ( pos==start) return true; assert( pos > start); for ( --pos; ; --pos) { if ( pos==start || *pos=='\n') return true; if ( !isspace( *pos)) return false; } } } namespace parser { std::string LoadFile( const char* filename) { if ( 0==strcmp( filename, "-")) { //in3 << std::cin.rdbuf(); std::string ret; for(;;) { int c = getchar(); if ( c==EOF) break; ret += c; } return ret; } else { /*`in2 << in.rdbuf()' hangs with VC++'s STL, and takes ages in some implementations of stringstream, so we always do it the long way: */ #if 1 FILE* in2 = fopen( filename, "r"); if ( !in2) throw exception_stream() << "Can't open `" << filename << "' for reading"; std::string ret; for( unsigned int i=0;; ++i) { int c = getc( in2); //if ( (i%100000)==0) std::cerr << i << ": " << c << "\n"; if ( c==EOF) break; /* Have to override std::string's allocator strategy to be like std::vector, otherwise it's painfully slow for large input files. */ if ( ret.capacity() <= i) ret.reserve( ret.capacity() * 2); ret += static_cast< char>( c); } return ret; #else /*std::ifstream in( filename); if ( !in) throw exception_stream() << "Can't open `" << filename << "' for reading"; std::stringstream in2; in2 << in.rdbuf(); return in2.str();*/ #endif } } void lexer_t::Init( const std::string& filename_, const char* text_, bool autoblocks_, bool verbose_, bool show_failed_terminals0, bool show_all_errorpos0, bool show_startpos0, bool no_cplusplus0 ) { debug0 << "show_failed_terminals0=" << show_failed_terminals0 << "\n"; show_failed_terminals = show_failed_terminals0; show_all_errorpos = show_all_errorpos0; show_startpos = show_startpos0; check_indentation = false; autoblocks = autoblocks_; current_indentation = 0; pos = text_; text = text_; end = text_+strlen( text_); nextpos = NULL; filename = filename_; verbose = verbose_; linechar0 = text_; line0 = 1; peeked = NULL; no_cplusplus = no_cplusplus0; this->backtrackpositions.push_back( this->text); } lexer_t::lexer_t( const std::string& filename_, const char* text_, bool autoblocks_, bool verbose_, bool show_failed_terminals0, bool show_all_errorpos0, bool show_startpos0, bool no_cplusplus0 ) : show_failed_terminals( false), show_all_errorpos( false), show_startpos( false), check_indentation( false), autoblocks( false), current_indentation( 0), indentation_nesting(), pos( NULL), text( NULL), end( NULL), text_we_own(), nextpos( NULL), filename(), verbose( false), backtrackpositions(), linechar0( NULL), line0( 0), peeked( NULL), no_cplusplus( false), failed() { Init( filename_, text_, autoblocks_, verbose_, show_failed_terminals0, show_all_errorpos0, show_startpos0, no_cplusplus0); } lexer_t::lexer_t( const std::string& filename_, bool autoblocks_, bool verbose_, bool show_failed_terminals0, bool show_all_errorpos0, bool show_startpos0, bool no_cplusplus0 ) : show_failed_terminals( false), show_all_errorpos( false), show_startpos( false), check_indentation( false), autoblocks( false), current_indentation( 0), indentation_nesting(), pos( NULL), text( NULL), end( NULL), text_we_own( LoadFile( filename_.c_str())), nextpos( NULL), filename(), verbose( false), backtrackpositions(), linechar0( NULL), line0( 0), peeked( NULL), no_cplusplus( false), failed() { Init( filename_, text_we_own.c_str(), autoblocks_, verbose_, show_failed_terminals0, show_all_errorpos0, show_startpos0, no_cplusplus0); } #ifndef NDEBUG void lexer_t::Check() const { assert( this->pos >= this->text); assert( this->pos <= end); assert( this->linechar0 >= this->text); assert( this->linechar0 <= end); assert( this->pos >= this->linechar0); assert( this->nextpos==NULL || (this->nextpos>=this->text && this->nextpos<=this->end)); } #endif int lexer_t::GetLineNumber( const char* p) const { if ( !p) p = this->pos; Check(); int l = line0; for ( const char* c=this->linechar0; cpos; Check(); return p - GetLineStart( this->text, p); } const std::string& lexer_t::GetFilename() const { Check(); return this->filename; } const char* lexer_t::GetLinePos( const char* p) const { if ( !p) p = this->pos; Check(); return GetLineStart( this->text, p); } std::string lexer_t::GetLine( const char* p) const { Check(); const char* s = this->GetLinePos( p); std::string ss; for ( ; *s && *s!='\n'; ++s) ss += *s; //debug << "GetLine returning " << ss << "\n"; return ss; } const char* lexer_t::GetPos() const { Check(); assert( this->pos==this->text || *this->pos==0 || !isspace( *this->pos)); return this->pos; } lexer_t::BracketNesting::BracketNesting( const char* text, const char* pos_, const std::type_info& closetype_) : closetype( &closetype_), pos( pos_), indentation( 0) { this->indentation = GetIndentation( text, pos_); } node_t* peeknext_internal( lexer_t& lexer) { lexer.Check(); assert( !lexer.peeked); /*static Profile::PNamedStatistics profstats( "PeekNext"); Profile::PStatScope prof( profstats);*/ EatWhite( lexer.pos); if ( lexer.autoblocks) { if ( lexer.verbose) { char buffer[5]=""; strncat( buffer, lexer.pos, 4); std::cerr << "Looking at " << buffer << "\n"; } int new_indentation = GetIndentation( lexer.text, lexer.pos); /* we only attempt to invent a '{' or '}' if we are at top level or in a {...} block. So we don't insert '{' or '}' when inside (...) for example. */ if ( new_indentation != lexer.current_indentation && ( lexer.indentation_nesting.empty() || *lexer.indentation_nesting.back().closetype==typeid( keyword_CLOSECURLY)) && ( lexer.pos[0]!='#')) { if ( new_indentation > lexer.current_indentation) { if ( lexer.verbose) std::cerr << "*pos=" << *lexer.pos << "*********inserting `{'\n"; lexer.peeked = new keyword_OPENCURLY; lexer.peeked->set_text( "{"); lexer.current_indentation = new_indentation; } else if ( new_indentation < lexer.current_indentation && !lexer.indentation_nesting.empty() && new_indentation < lexer.indentation_nesting.back().indentation ) { if ( lexer.verbose) std::cerr << "*pos=" << lexer.pos << "**********inserting `}'\n"; lexer.peeked = new keyword_CLOSECURLY; /* Ensure that we have a newline between an inserted '}' and a #define. This spoils the line numbering (should be poss to fix), but won't occur very often.*/ if ( *lexer.pos=='#') lexer.peeked->set_text( "}\n"); else lexer.peeked->set_text( "}"); if ( lexer.indentation_nesting.size()==1) lexer.current_indentation = 0; else { assert( lexer.indentation_nesting.size() >= 2); lexer.current_indentation = lexer.indentation_nesting.end()[-2].indentation; } } else { assert( false); } } } if ( lexer.peeked) { /* from autoblocks code, so for the next lexical item, we need to start looking from the current position. */ lexer.nextpos = lexer.pos; lexer.Check(); } else { const char* itemend = lexer.pos; // modified when item is found, to point to end of item. const char* begin = itemend; if ( *itemend==0) { lexer.peeked = new endoffile_t(); } else if ( begin[0]=='/' && begin[1]=='*') { for (;; ++itemend) { if ( itemend[0]=='*' && itemend[1]=='/') { itemend += 2; break; } if ( !itemend[0]) throw error_t( lexer, "Unmatched C-style comment", true); } lexer.peeked = new c_comment_t; //std::cerr << "Found C comment\n"; } else if ( begin[0]=='/' && begin[1]=='/') { for (;; ++itemend) { if ( itemend[0]=='\n') { ++itemend; break; } if ( !itemend[0]) break; } lexer.peeked = new cpp_comment_t; //std::cerr << "Found C++ comment\n"; } else if ( begin[0]=='"' || begin[0]=='\'' || ( begin[0]=='L' && ( begin[1]=='"' || begin[1]=='\''))) { char quote = begin[0]; if ( quote!='"' && quote!='\'') { ++itemend; quote=begin[1]; } for(;;) { for ( ++itemend; *itemend!=quote; ++itemend) { if ( *itemend==0) throw error_t( lexer, "End of file reached inside string"); if ( *itemend=='\\') ++itemend; } ++itemend; // now points to char after closing quote. assert( itemend[-1] == quote); // look for another string after white space. const char* c=itemend; for ( ; c && isspace( *c); ++c) {} if ( *c==quote) { itemend = c; continue; } else { break; } } lexer.peeked = new string_t;//( begin, end); } else if ( ( lexer.peeked = get_keyword( itemend, lexer.no_cplusplus))) { } /*else if ( ( lexer.peeked = GetAlNumKeyword( itemend, lexer.no_cplusplus))) { } else if ( ( lexer.peeked = GetNonAlNumKeyword( itemend))) { }*/ else if ( IsAl_( *itemend)) { for ( ++itemend; *itemend!=0 && IsAlNum_( *itemend); ++itemend) {} lexer.peeked = new identifier_t; } else if ( StartsWith( itemend, "0x")) { for ( itemend+=2; *itemend!=0 && isxdigit( *itemend); ++itemend) {} for ( ; *itemend!=0 && isalpha( *itemend); ++itemend) {} lexer.peeked = new number_t; } else if ( isdigit( *itemend)) { // look for digits followed by any characters (e.g. 1234ul) for ( ++itemend; *itemend!=0 && isdigit( *itemend); ++itemend) {} for ( ; *itemend!=0 && isalpha( *itemend); ++itemend) {} lexer.peeked = new number_t; } else if ( *begin=='#' && (IsFirstNonSpaceOnLine( lexer.text, begin))) { /* This handles the following two cases: # line [] [anything else] # [] [anything else] The second format is output by the GNU C Preprocessor. See http://gcc.gnu.org/onlinedocs/cpp_1.html#SEC44 */ lexer.Check(); assert( begin <= lexer.end); assert( lexer.pos == begin); std::string line = lexer.GetLine(); //debug << "Looking at preproc line:\n" << line; for ( const char* linestart=begin;;) { itemend = strchr( linestart, '\n'); if ( itemend && itemend>linestart && itemend[-1]=='\\') { // last char on line is back-slash. linestart = itemend + 1; continue; } if (!itemend) itemend = lexer.end; assert( itemend <= lexer.end); EatWhite( itemend); assert( itemend <= lexer.end); break; } /* Make temporary buffer for use by scanf. we're assuming that std::vector's storage is contiguous, which isn't strictly guaranteed (yet). */ std::vector< char> newfilename_buffer( 1+line.size()); char* newfilename = &newfilename_buffer[0]; int n; int linenumber; n = sscanf( line.c_str(), "# line %i \"%s\"", &linenumber, newfilename); if ( !n) n = sscanf( line.c_str(), "# %i \"%s\"", &linenumber, newfilename); //std::cerr << "n=" << n << "\n"; if ( n==0) // not #line - some other preprocessor directive. { hash_t* hash = new hash_t; hash->begin = begin; hash->end = itemend; lexer.peeked = hash; //debug << "Found hash: " << *hash; } else { hash_line_t* hashline = new hash_line_t; hashline->linenumber = linenumber; lexer.line0 = linenumber; //std::cerr << "new linenumber: " << lexer.line0 << "\n"; if ( n>=2) // we read a filename { // replace '\\' by '\' in newfilename const char* from = newfilename; char* to = newfilename; for ( from=newfilename, to=newfilename;; ++from, ++to) { if ( from[0]=='\\' && from[1]=='\\') ++from; *to = *from; if ( *to == 0) break; } lexer.filename = newfilename; hashline->filename = newfilename; // takes copy of string pointed to by newfilename //std::cerr << "new filename: " << lexer.filename << "\n"; } lexer.peeked = hashline; //debug << "Found hashline: " << *lexer.peeked; } lexer.Check(); } else { //throw error_t( *this, "Unrecognised lexer item"); } if ( !lexer.peeked) throw error_t( lexer, "lexer_t error", true); lexer.peeked->begin = begin; lexer.peeked->end = itemend; lexer.nextpos = itemend; lexer.Check(); //debug << "Found " << typeid( *lexer.peeked).name() << ": " << lexer.peeked->end-lexer.peeked->begin << " `" << *lexer.peeked << "'\n"; //std::cerr << "lexer_t now at line " << lexer.GetLineNumber() << "\n"; if (lexer.peeked->end <= lexer.pos && !dynamic_cast< endoffile_t*>( lexer.peeked)) { std::cerr << "lexer.peeked->end-lexer.pos=" << lexer.peeked->end-lexer.pos << "\n"; std::cerr << "lexer.pos-lexer.text=" << lexer.pos-lexer.text << "\n"; throw std::logic_error( "internal_t lexer error: 1. end <= pos\n"); } } if ( lexer.check_indentation || lexer.autoblocks) { if ( false) {} else if ( typeid( *lexer.peeked)==typeid( keyword_OPENCURLY)) lexer.indentation_nesting.push_back( lexer_t::BracketNesting( lexer.text, lexer.pos, typeid( keyword_CLOSECURLY))); else if ( typeid( *lexer.peeked)==typeid( keyword_OPENSQUARE)) lexer.indentation_nesting.push_back( lexer_t::BracketNesting( lexer.text, lexer.pos, typeid( keyword_CLOSESQUARE))); else if ( typeid( *lexer.peeked)==typeid( keyword_OPENROUND)) lexer.indentation_nesting.push_back( lexer_t::BracketNesting( lexer.text, lexer.pos, typeid( keyword_CLOSEROUND))); else if ( typeid( *lexer.peeked)==typeid( keyword_CLOSECURLY) || typeid( *lexer.peeked)==typeid( keyword_CLOSESQUARE) || typeid( *lexer.peeked)==typeid( keyword_CLOSEROUND) ) { if ( lexer.check_indentation) { if ( lexer.indentation_nesting.size()==0 || typeid( *lexer.peeked)!=*lexer.indentation_nesting.back().closetype) { AddPosInfo( std::cerr, lexer, "Unmatched close bracket", true); } int indentation_open = lexer.indentation_nesting.back().indentation; int indentation_close = GetIndentation( lexer.text, lexer.peeked->begin); if ( indentation_open != indentation_close) { std::cerr << "Warning: mismatched indentation of lines containing open/close brackets:\n"; std::stringstream buffer1; buffer1 << "Line containing opening bracket is indented by " << indentation_open; const char* oldpos = lexer.pos; lexer.pos = lexer.indentation_nesting.back().pos; AddPosInfo( std::cerr, lexer, buffer1.str(), true, false); //std::cerr << "\n"; lexer.pos = oldpos; std::stringstream buffer2; buffer2 << "Line containing closing bracket is indented by " << indentation_close; AddPosInfo( std::cerr, lexer, buffer2.str(), true, false); //std::cerr << "\n"; } } if ( lexer.indentation_nesting.size()>0) lexer.indentation_nesting.pop_back(); } } lexer.Check(); return lexer.peeked; } node_t* lexer_t::PeekNext() { //Check(); if ( this->peeked) return this->peeked; node_t* ret = peeknext_internal( *this); return ret; } node_t* lexer_t::GetNext() { Check(); /*if ( this->verbose) { static clock_t t = 0; if ( clock()-t > 4*CLOCKS_PER_SEC) std::cerr << "lexer_t pos is " << this->GetFilename() << ": " << this->GetLineNumber() << "\n"; t = clock(); }*/ node_t* node = this->PeekNext(); if ( this->verbose) { debug0 << "lexer_t read token " << typeid( *node).name() << "`" << node->output_compact_tostring() << "', end-begin=" << node->end - node->begin << "\n"; } this->peeked = NULL; if ( node->end <= this->pos && !dynamic_cast< const endoffile_t*>( node)) { if ( autoblocks && ( typeid( *node)==typeid( keyword_OPENCURLY) || typeid( *node)==typeid( keyword_CLOSECURLY) ) ) { // this code could be an extra one inserted by the autoblocks code, // in which case the node pos is allowed to be outside of the input text. } else { std::cerr << typeid( *node).name() << ": " << *node << "\n"; throw std::logic_error( "internal_t lexer error: 2. end <= pos\n"); } } Check(); this->pos = this->nextpos; Check(); EatWhite( this->pos); Check(); if ( hash_line_t* hashline = dynamic_cast< hash_line_t*>( node)) { this->linechar0 = hashline->end; } else if ( simplenode_t* simplenode=dynamic_cast< simplenode_t*>( node)) { // add all trailing #... lines or comments to end as hidden node. // Note that gcc -O3 seems to generate incorrect code here, so //std::cerr << "Found simple node - looking for trailing #line/comments. this->peeked=" << ((void*)this->peeked) << "\n"; //std::cerr << "this->pos=" << ((void*)this->pos) << ", node->end=" << ((void*)node->end) << "\n"; //std::cerr << "Next node is " << ((void*) next) << ", type " << typeid( *node).name() << "\n"; for ( comment_or_hash_t* commentorhash = dynamic_cast< comment_or_hash_t*>( this->PeekNext()); commentorhash; commentorhash = dynamic_cast< comment_or_hash_t*>( this->PeekNext()) ) { //debug << "Found trailing coment/hash: " << *commentorhash << "\n"; if ( !simplenode->extra) { simplenode->extra = new misc_t; simplenode->extra->begin = commentorhash->begin; } simplenode->extra->end = commentorhash->end; //std::cerr << "found trailing # or comment: " << *simplenode_t->extra; this->peeked = NULL; //this->pos = commentorhash->end; Check(); this->pos = this->nextpos; Check(); EatWhite( this->pos); Check(); if ( hash_line_t* trailing_hashline = dynamic_cast< hash_line_t*>( commentorhash)) { this->linechar0 = trailing_hashline->end; } } if ( simplenode->extra) { //debug << "FOund extra " << (this->pos-node->end) << ", total node is: `" << *node << "'"; } } //debug << "lexer_t returning token `" << *node << "'\n"; //debug << *node; //std::cerr << "Line " << GetLineNumber() << "\n"; Check(); //debug << "lexer_t::GetNext() returning " << *node << "\n"; //debug << "End\n"; return node; } void lexer_t::Skip( const node_t* node) { (void) node; // unused in non-debug builds. Check(); assert( this->peeked); assert( node == this->peeked); this->GetNext(); } void lexer_t::AddFailed( const std::type_info& ti, const char* description) { Check(); if ( !this->show_failed_terminals) return; if ( !this->show_all_errorpos) { if ( !this->failed.empty()) { //assert( this->failed.size()==1); if ( this->pos < this->failed.begin()->pos) return; // we only keep one failed pos if !this->detailederrors if ( !this->show_failed_terminals || this->pos > this->failed.begin()->pos) this->failed.clear(); } } //std::cerr << "Adding failure " << description << " at " << (this->pos-this->text) << "\n"; assert ( !this->backtrackpositions.empty()); this->failed.insert( FailedInfo( ti, description, this->pos, this->backtrackpositions.back())); Check(); } void lexer_t::ClearFailed() { this->failed.clear(); } lexer_mark_t::lexer_mark_t( lexer_t& lexer_) : lexer( lexer_), //lexer0( lexer_), have_backtracked( false), show_failed_terminals( lexer_.show_failed_terminals), show_all_errorpos( lexer_.show_all_errorpos), show_startpos( lexer_.show_startpos), check_indentation( lexer_.check_indentation), autoblocks( lexer_.autoblocks), current_indentation( lexer_.current_indentation), indentation_nesting( lexer_.indentation_nesting), pos( lexer_.pos), text( lexer_.text), end( lexer_.end), nextpos( lexer_.nextpos), verbose( lexer_.verbose), backtrackpositions( lexer_.backtrackpositions), linechar0( lexer_.linechar0), line0( lexer_.line0), peeked( lexer_.peeked), no_cplusplus( lexer_.no_cplusplus) { lexer.backtrackpositions.push_back( lexer.GetPos()); } void lexer_mark_t::Backtrack() { /* We allow multiple calls to this function. used by the parser e.g. in TryGetFnParams(). otherwise we could do: assert( !this->have_backtracked) */ this->have_backtracked = true; // copy all except the failed terminals. //lexer_t::FailedTerminals failedterminals = this->lexer.GetFailedTerminals(); //this->lexer = this->lexer0; this->lexer.show_failed_terminals = this->show_failed_terminals; this->lexer.show_all_errorpos = this->show_all_errorpos; this->lexer.show_startpos = this->show_startpos; this->lexer.check_indentation = this->check_indentation; this->lexer.autoblocks = this->autoblocks; this->lexer.current_indentation = this->current_indentation; this->lexer.indentation_nesting = this->indentation_nesting; this->lexer.pos = this->pos; this->lexer.text = this->text; this->lexer.end = this->end; this->lexer.nextpos = this->nextpos; this->lexer.verbose = this->verbose; this->lexer.backtrackpositions = this->backtrackpositions; this->lexer.linechar0 = this->linechar0; this->lexer.line0 = this->line0; this->lexer.peeked = this->peeked; this->lexer.no_cplusplus = this->no_cplusplus; //this->lexer.failed = failedterminals; debug0 << "Backtracked to " << this->lexer.pos[0] << this->lexer.pos[1] << this->lexer.pos[2] << this->lexer.pos[3] << " peeked=" << *this->lexer.peeked << "\n"; } lexer_mark_t::~lexer_mark_t() { if ( !have_backtracked) { //assert( lexer.backtrackpositions.back() == this->lexer0.GetPos()); this->lexer.backtrackpositions.pop_back(); /* Used to have this code to remove knowledge of all failed terminals after the current position. But these failed terminals are actually what the failedterminal system is all about - giving maximum info about what happened in a failed parse.*/ /*for ( lexer_t::FailedTerminals::iterator it=this->lexer.failed.begin(); it!=this->lexer.failed.end();) { if ( it->pos > this->lexer0.pos) { lexer_t::FailedTerminals::iterator next = it; ++next; this->lexer.failed.erase( it); it = next; } else ++it; }*/ /* The following failes to compile with g++ 2.95.2 STLport 4.1b2 - remove_if seems to return const_iterator? */ /*IfLater iflater( this->lexer0.pos); this->lexer.failed.erase( std::remove_if( this->lexer.failed.begin(), this->lexer.failed.end(), iflater ), this->lexer.failed.end() );*/ } } }