1.1 - RECOGNIZING SOME TAGS IN THE HEADER PART //////////////////////////////// lex1.1a.l ///////////////////////////////// X "" { printf("[B1=%s]",yytext); } . { // markedly show all what was unforeseen printf("[%s!!]",yytext); } %% /////////////////////////////// 1.1a.datOK ///////////////////////////////// /////////////////////////////// 1.1a.datBAD ///////////////////////////////// //////////////////////////////// lex1.1b.l ///////////////////////////////// X "" { printf("[B2=%s]",yytext);} . { printf("[%s!!]",yytext); } %% /////////////////////////////// 1.1b.datOK ///////////////////////////////// /////////////////////////////// 1.1b.datBAD ///////////////////////////////// //////////////////////////////// lex1.1c.l ///////////////////////////////// DT "" { printf("[B3=%s]",yytext); } . { printf("[%s!!]",yytext); } %% /////////////////////////////// 1.1c.datOK ///////////////////////////////// /////////////////////////////// 1.1c.datBAD ///////////////////////////////// 1.2 - RECOGNIZING THE MAIN TAGS IN THE BODY OF AN Xml FILE a) RECOGNITION ONLY ///////////////////////////////// lex1.2a.l /////////////////////////////////// OpTag "<"[^>]+">" ClTag "]+">" STag "<"[^>]+" />" %% {STag} { printf("[S_TAG=%s]", yytext); } {ClTag} { printf("[CL_TAG=%s]", yytext); } {OpTag} { printf("[OP_TAG=%s]", yytext); } %% ///////////////////////////// lex1.2a.dat ///////////////////////////////////// user.mod.news /////////////////////////////////////////////////////////////////////////////// Remarks - here we choose not to show all other chars, but just to echo them. A more cautious possibility would be to echo spaces and newlines, and to show all other chars very markedly (this quickly gets boring, but it's for debug): [ \n] ECHO; . printf("[%c!]", yytext[0]); - we assume that in simple tags the "/>"end must be preceded with a space - using expressions as [^>] makes the specification more general, the analyzer won't be tied to accept a restricted char set in the tag body - the rules have to be specified in some suitable order, due to ambiguities - we might be tempted to use the [^/] subexpression for defining STag, but this would forbid '/' chars inside the tag body. Or to use [^/>] but then the rule for the S tag could take an opening tag as a simple tag, and the rule order would be even more crucial b) EXTRACTING THE TAG NAME ///////////////////////////////// lex1.2b.l /////////////////////////////////// %{ #include // Locate a separator-delimited "field" in the recognized fragment yytext, copy // it into a dedicated buffer (yytext will be overwritten by next fragment) char * getFieldFrom( int beg ) { int i = beg, c; char * s; while( (c=yytext[i]) != '\0' && c!='/' && c != '>' && c != ' ' ) i++; if( (s=malloc(i-beg+1))==NULL ) { printf("malloc FAILED !");exit(1);} strncpy( s, &yytext[beg], i-beg ); s[i-beg] = 0; return s; } %} OpTag "<"[^>]+">" ClTag "]+">" STag "<"[^>]+" />" %% {STag} { printf("[S_TAG name=%s]", getFieldFrom( 1 ) ); } {ClTag} { printf("[CL_TAG name=%s]", getFieldFrom( 2 ) ); } {OpTag} { printf("[OP_TAG name=%s]", getFieldFrom( 1 ) ); } %% c) VARIANT (bonus) Since lexical analysis may gets tricky, it's a good idea to proceed with very gradual macro definitions. (Note that, for several notions here, the correct place to recognize them could be the syntaxic level). Symb [^ />\n]+ Path {Symb}("/"{Symb})* Pair {Symb}"="({Symb}|{Path}) Field {Pair}|{Symb}|{Path} S1Tag "<"({Field}" ")*"/>" %% {S1Tag} { printf("[S1Tag=%s]", yytext); } %% /////////////// lex1.2c.dat = variant from lex1.2a.dat //////////////////////// 2.1 - TREE GRAMMARS : RECOGNIZING THE STRUCTURE ONLY In this question, the OP, CL, S tags have no associated names. 2.1a) Entry text presented on a only line To nevertheless deal with a final \n, it is convenient to define a notion of Text. The tree structure is encoded with a Block, and it is followed with a \n. ///////////////////////////// 2.1a.dat ////////////////////////////////////// OP OP S S CL OP S CL S CL ///////////////////////////// gram2.1a.y ////////////////////////////////////// %token OP CL S '\n' %% Text : Block '\n' { printf("<>"); } ; Block : OP List CL { printf("<>"); } ; List : { printf("<>"); } | List S { printf("<>"); } | List Block { printf("<>"); } ; %% /////////////////////////////////// lex2.1a.l ///////////////////////////////// %{ #include "y.tab.c" %} %% OP { printf("[OP]"); return OP; } CL { printf("[CL]"); return CL; } S { printf("[S]"); return S; } \n return '\n'; [ \t] ECHO; . { printf("[%c!!]",yytext[0]); // Clearly spot the error return yytext[0]; // Make the syntaxic analyzer to err } %% 2.1b) Entry text is presented on several lines Here we have to admit a \n after any tag (P, CL or S). A possible grammar is as follows. The Text notion no longer is useful. The lexical analyzer is the same. ////////////////////////// 2.1b.dat ////////////////////////////////////// OP OP S S CL OP S CL S CL ////////////////////////// gram2.1b.y ///////////////////////////////////////// %token OP CL S '\n' %{ #include "lex.yy.c" %} %% Block : OP '\n' List CL '\n' { printf("<>"); } ; List : { printf("<>"); } | List S '\n' { printf("<>"); } | List Block { printf("<>"); } ; %% 2.2 - TREE GRAMMARS : RECOGNIZING A NAMED TREE Now the OP, CL and S tags have an associated name. At the lexical level, getNameFrom() gets this name. This function is nearly the same as Q1.2b's getFieldFrom(), it just accounts for less delimiters. To pass a name to the syntaxic level, we define types associated to the (union) yylval variable. (In view of Q3, we define also an type for the Block and List notions). The values associated to lexical categories OP, CL, S are given a type. The action for the Block rule checks the values associated to OP and CL tokens. /////////////////////////////// 2.2.dat ////////////////////////////////////// OP1 OP2 S3 S4 CL2 OP5 S6 CL5 S7 CL1 /////////////////////////////// gram2.2.y //////////////////////////////////// %token OP CL S %token '\n' %type List Block %union { char * name; void * any; } %% Block : OP '\n' List CL '\n' { printf("<>\n",$1,$4); if( strcmp( $1, $4 ) ) { printf("UNMATCHED NAMES!\n"); exit(1); } } ; List : { printf("<>"); } | List S '\n' { printf("<>\n"); } | List Block ; %% /////////////////////////////// lex2.2.l ///////////////////////////////////// %{ #include "y.tab.c" char * getNameFrom( int beg ) { int i = beg, c; char *s; while( (c=yytext[i]) != '\0' && c != '\n' && c != ' ' && c != '\t' ) i++; s = malloc( i-beg+1 ); // one more for \0 if( s == NULL ) { printf("malloc() FAILED !!"); exit(1); } strncpy( s, &yytext[beg], i-beg ); return s; } void lexTrace( ) { printf("[%s]",yytext); } %} %% OP[^ \t\n]+ { lexTrace( ); yylval.name=getNameFrom(2); return OP; } CL[^ \t\n]+ { lexTrace( ); yylval.name=getNameFrom(2); return CL; } S[^ \t\n]+ { lexTrace( ); yylval.name=getNameFrom(1); return S; } \n return '\n'; [ \t] ECHO; . { printf("[%c!!]",yytext[0]); return yytext[0]; } %% 3.1 - RECOGNIZING A NAMED TREE - the syntaxic analyzer is nearly as gram2.2.y . here we just deal with some more cases with respect to newlines, to accept a List, and an OP/CL block when defined on a only line. . note that gram1.2.y already cared to assign types to Block and List, in view of the subsequent tree building - the lexical analyzer derives very simply from lex2.1b.l : . includes y.tab.c, pass the names via yylval, return the tokens, including \n . for simplicity, it skips any unforeseen character (note that in an industrial context this should be improved) ////////////////////////////// gram3.1.y ////////////////////////////////////// %token OP CL S %token '\n' %type List Block %union { char * name; void * any; } %% Block : OP '\n' List CL '\n' { printf("<>\n",$1,$4); if( strcmp( $1, $4 ) ) { printf("UNMATCHED NAMES!\n"); exit(1); } } | OP List CL '\n' { printf("<>\n",$1,$3); if( strcmp( $1, $3 ) ) { printf("UNMATCHED NAMES!\n"); exit(1); } } ; List : { printf("<>"); } | List S '\n' { printf("<>\n"); } | List S { printf("<>"); } | List Block { printf("<>\n"); } ; %% ////////////////////////////////// lex3.1.l ////////////////////////////////// %{ #include #include "y.tab.c" char * getFieldFrom( int beg ) { int i = beg, c; char * s; while( (c=yytext[i]) != '\0' && c!='/' && c != '>' && c != ' ' ) i++; if( (s=malloc(i-beg+1))==NULL ) { printf("malloc FAILED !");exit(1);} strncpy( s, &yytext[beg], i-beg ); s[i-beg] = 0; return s; } %} OpTag "<"[^>]+">" ClTag "]+">" STag "<"[^>]+" />" %% {STag} { yylval.name = getFieldFrom(1); return S; } {ClTag} { yylval.name = getFieldFrom(2); return CL; } {OpTag} { yylval.name = getFieldFrom(1); return OP; } \n return '\n'; [ \t] ECHO; . ; %% ////////////////////////////////// 3.1.dat //////////////////////////////////// user.mod.news 3.2 - BUILDING THE TREE STRUCTURE - the syntaxic part gram3.2.y derives from gram3.1.y as follows: . we include the lists.c file to manage Nodes, Lists, Blocks . a Block variable remembers the root Block, for printing the resulting tree . other auxiliary pointer variables are defined once for all . actions for the "Block" rules consists in creating one Block . actions associated to the List rules respectively: create a new List, add a "simple" node to it, or add a "node with a block" - the lexical analyzer lex3.2.l is identical to lex3.1.l - in the "prin.c" file, just yyparse() is modified to print the tree /////////////////////////////// gram3.2.y ///////////////////////////////// %token OP CL S %token '\n' %type List Block %union { char * name; void * any; } %{ #include #include "lists.c" Block * root, * bptr; List * lptr; Node * nptr; int level = 0; %} %% Block : OP '\n' List CL '\n' { if( strcmp( $1, $4 ) ) { printf("UNMATCHED NAMES!\n"); exit(1); } else { bptr = newBlock( $1, $3, --level ); if( level==0 ) { root = bptr; printf("ROOT "); } $$ = bptr; } } | OP List CL '\n' { printf("<>\n",$1,$3); if( strcmp( $1, $3 ) ) { printf("UNMATCHED NAMES!\n"); exit(1); } else { bptr = newBlock( $1, $2, --level ); if( level==0 ) { root = bptr; printf("ROOT "); } $$ = bptr; } } ; List : { lptr = newList( ); printf("",lptr,level); level++; $$ = lptr; } | List S '\n' { nptr = newSimpleNode($2); appendNodeToList( nptr, $1 ); printf("\n",$1, $2); $$ = $1; // current list remains the same } | List S { nptr = newSimpleNode($2); appendNodeToList( nptr, $1 ); printf("\n",$1, $2); $$ = $1; } | List Block { nptr = newNodeWithABlock($2); appendNodeToList( nptr, $1 ); printf("\n",$1,nptr->bptr->name); $$ = $1; } ; %% /////////////////// prin.c : only yywrap() modified /////////////////////// #include #include "lex.yy.c" main() { yyparse(); } yyerror(char *s) { printf("<%s!!>",s); } yywrap() { printf("\nTREE LISTING :\n"); blockList( root ); return 1; }