[antlr-interest] completed an ANTLR-based cpp preprocessor (#include, #if, #define, etc)

thrutchy eric_mahurin at yahoo.com
Thu Jul 22 13:42:05 PDT 2004


I just completed a go at cpp (C PreProcessor) using an ANTLR lexer. 
It could be integrated with an existing lexer.  It handles all of the
hard stuff: #define/#undef, macro calls, #ifdef/#ifndef/#else/#elsif,
and #include.  I know it doesn't follow the cpp spec to a T, but it
has all of the major features.  I'm doing a preprocessor similar to
cpp and thought I'd make a cpp version and let you guys have it.

This was more difficult than I expected, especially since this is my
first experience with Java (and I just learned C++ too).  I welcome
input to improve this.  I didn't see how to get rid of the "deprecated
API" warning by using StringReader instead of StringBufferInputStream.

Anyways, here it is (sorry if the lines get split):

// Author: Eric Mahurin
// License: just give me credit

// File-level options: the embedded action code in this grammar is Java.
options {
    language="Java";
}

{

import java.io.*;
import java.util.*;
import antlr.*;

/**
 * Command-line driver: runs the preprocessing lexer over standard input and
 * writes the text of every token it produces to standard output.
 */
class cpp implements cppLexerTokenTypes {
    /**
     * Selector shared with the lexers; nested lexers for included files and
     * macro expansions are pushed onto it.
     */
    public static TokenStreamSelector selector = new TokenStreamSelector();

    /**
     * Reads tokens from the selector until end of input, echoing each one.
     * Any exception is reported on standard error.
     */
    public static void main(String[] args) {
        try {
            cppLexer mainLexer = new cppLexer(new DataInputStream(System.in));
            // the lexer's selector field is static, so set it through the class
            cppLexer.selector = selector;
            selector.select(mainLexer);

            Token current = selector.nextToken();
            while (current.getType() != Token.EOF_TYPE) {
                System.out.print(current.getText());
                current = selector.nextToken();
            }
        } catch (Exception e) {
            System.err.println("exception: " + e);
        }
    }
}

}

// The preprocessor is implemented entirely as a lexer: directives and macro
// calls are recognised and acted upon while tokenizing.
class cppLexer extends Lexer;

options {
    // do not test tokens against the literals table by default
    // (the IDENTIFIER rule turns this back on for itself)
    testLiterals = false;
    // four characters of lookahead; e.g. the directive keywords "else" and
    // "elsif" only differ at their fourth character
    k = 4;
}

tokens {
    // Token type with no rule of its own: DIRECTIVE assigns it to tell the
    // token-discarding loops of a false conditional to stop.
    ENDIF ;
}

{
    // Selector used to switch between nested lexers (included files and
    // macro expansions). Must be assigned externally before lexing starts.
    public static TokenStreamSelector selector;

    // State of the innermost conditional:
    //    1: true  - text is being passed through
    //    0: false - text is discarded, but a later else/elsif may be taken
    //   -1: false - text is discarded and no else/elsif may be taken
    protected static Integer ifState = 1;

    // Saved ifState values of the enclosing (nested) conditionals.
    protected static List ifStates = new ArrayList();

    // Macro name -> List whose element 0 is the macro text and whose
    // remaining elements are the parameter names.
    protected static Map defines = new Hashtable();

    // Arguments of the macro call this lexer is expanding:
    // parameter name -> single-element List holding the argument text.
    protected Map defineArgs = new Hashtable();

    // At the end of a nested stream, drop back to the lexer that pushed this
    // one and make the selector fetch the next token from it.
    public void uponEOF() throws TokenStreamException, CharStreamException {
        try {
            selector.pop(); // return to old lexer/stream
            selector.retry();
        } catch (NoSuchElementException e) {
            // nothing left on the stack: this is the outermost stream, so
            // fall through and let a real EOF token be returned
        }
    }
}

// A preprocessor directive: '#' followed by include, define, undef, ifdef,
// ifndef, else, elsif or endif.
//
// Conditional handling relies on ifState (1: true, 0: false, -1: false and no
// else/elsif may be taken) and on the ifStates stack of enclosing states.
// While a conditional is false, the directive that made it false pulls tokens
// from the selector and discards them until it sees a token of type ENDIF,
// which is produced either by a matching endif or by an else/elsif that
// becomes true. The discarding loops also stop at end of input so that a
// conditional with no closing endif cannot loop forever.
DIRECTIVE {
    List args = new ArrayList();
    boolean condition = true;
} : '#'
    ( "include" (WS)? includeFile:STRING { if (ifState==1) {
        // found this in examples/java/includeFile
        String name = includeFile.getText();
        // strip the surrounding quotes
        name = name.substring(1,name.length()-1);
        try {
            cppLexer sublexer = new cppLexer(new DataInputStream(new FileInputStream(name)));
            sublexer.defines = defines; // want defines to be persistent
            sublexer.setFilename(name);
            selector.push(sublexer);
            // continue lexing in the included file
            selector.retry();
        } catch (FileNotFoundException fnf) {
            System.err.println("cannot find file "+name);
        }
    }}
    | "define" WS defineMacro:RAW_IDENTIFIER
    {
        args.add(""); // first element will hold the macro text
    }
        (
            // get arguments if you find them (no spaces before left paren)
            ( '('
                (WS)? defineArg0:RAW_IDENTIFIER (WS)? {args.add(defineArg0.getText());}
                ( COMMA (WS)? defineArg1:RAW_IDENTIFIER (WS)? {args.add(defineArg1.getText());} )*
              ')'
            | ' '|'\t'|'\f'
            )
            ( options{greedy=true;}: ' '|'\t'|'\f' )*
            // store the text verbatim - tokenize when called
            defineText:MACRO_TEXT {args.set(0,defineText.getText());}
        )? '\n' {newline();}
    { if (ifState==1) {
        defines.put( defineMacro.getText(), args );
        $setType(Token.SKIP);
    }}
    | "undef" WS undefMacro:RAW_IDENTIFIER { if (ifState==1) {
        defines.remove(undefMacro.getText());
        $setType(Token.SKIP);
    }}
    | ("ifdef"|"ifndef"{condition=false;})
        WS ifMacro:RAW_IDENTIFIER
    {
        // remember the enclosing state so endif can restore it
        ifStates.add(ifState);
        if (ifState==1) {
            condition = (defines.containsKey(ifMacro.getText())==condition);
            ifState = condition?1:0;
        } else {
            // enclosing conditional is false: nothing in here may be taken
            ifState = -1;
        }
        if (ifState==1) {
            $setType(Token.SKIP);
        } else {
            // gobble up tokens until ENDIF (could be caused by else);
            // also stop at end of input, otherwise a missing endif would
            // make this loop spin on the EOF token forever
            for (;;) {
                try {
                    int skippedType = selector.nextToken().getType();
                    if (skippedType==ENDIF || skippedType==Token.EOF_TYPE) break;
                } catch (TokenStreamRetryException r) {
                    // just continue if someone tried retry
                }
            }
            // retry in case we switched lexers
            selector.retry();
        }
    }
    |
        ( "else" // treat like elsif (true)
        | "elsif" WS elsifMacro:RAW_IDENTIFIER {
            condition=defines.containsKey(elsifMacro.getText());
        }
        )
    {
        if (ifState==1) {
            // previous if/elsif was taken - discard rest
            ifState = -1;
            // stop at ENDIF, or at end of input if the endif is missing
            for (;;) {
                try {
                    int skippedType = selector.nextToken().getType();
                    if (skippedType==ENDIF || skippedType==Token.EOF_TYPE) break;
                } catch (TokenStreamRetryException r) {
                    // just continue if someone tried retry
                }
            }
            // retry in case we switched lexers
            selector.retry();
        } else if (ifState==0 && condition) {
            // "elsif" (true) or "else": tell the discarding loop to stop
            $setType(ENDIF);
            ifState = 1;
        }
    }
    | "endif" {
        condition = (ifState==1);
        if (ifStates.isEmpty()) {
            // endif with no if: leave the state and the token untouched
            System.err.println("endif without matching ifdef/ifndef");
        } else {
            // return to previous if state
            ifState = (Integer)ifStates.remove(ifStates.size()-1);
            if (condition) {
                $setType(Token.SKIP);
            } else {
                // tell if/else/elsif to stop discarding tokens
                $setType(ENDIF);
            }
        }
    }
    );

// An identifier, which may turn out to be a macro argument or a macro call.
//
// The name is looked up first among the arguments of the macro currently
// being expanded (defineArgs) and then, only when a token is being created,
// among the global defines. If the definition has parameters, a parenthesized
// argument list with the matching number of arguments must follow.
//
// When invoked from EXPR (no token is created) the replacement text is simply
// substituted. Otherwise a new lexer is pushed on the selector to tokenize the
// replacement text, with the call's arguments bound to the parameter names.
IDENTIFIER options {testLiterals=true;} {
    List define = null; // definition found for this name, if any
    List args = new ArrayList(); // argument texts of a macro call
} :
    identifier:RAW_IDENTIFIER
    {
        // see if this is a macro argument
        define = (List)defineArgs.get(identifier.getText());
        if (_createToken && define==null) {
            // see if this is a macro call
            define = (List)defines.get(identifier.getText());
        }
    }
    ( { (define!=null) && (define.size()>1) }? (WS|COMMENT)?
        // take in arguments if macro call requires them
        '('
        callArg0:EXPR {args.add(callArg0.getText());}
        ( COMMA callArg1:EXPR {args.add(callArg1.getText());} )*
        { args.size()==define.size()-1 }? // better have right amount
        ')'
    | { !((define!=null) && (define.size()>1)) }?
    )
{ if (define!=null) {
    String defineText = (String)define.get(0);
    if (!_createToken) {
        // just substitute text if called from EXPR - no token created
        $setText(defineText);
    } else {
        // create a new lexer to handle the macro text; reading it through a
        // StringReader avoids the deprecated StringBufferInputStream, which
        // keeps only the low eight bits of every character
        cppLexer sublexer = new cppLexer(new StringReader(defineText));
        for (int i=0;i<args.size();++i) {
            // treat macro arguments similar to local defines
            List arg = new ArrayList();
            arg.add((String)args.get(i));
            sublexer.defineArgs.put( (String)define.get(1+i), arg );
        }
        selector.push(sublexer);
        // retry in new lexer
        selector.retry();
    }
}};

// Quoted literal, written either "..." or '...'. A backslash escapes whatever
// single character follows it, so an escaped quote does not end the literal.
// NOTE(review): a newline inside a literal is consumed without a call to
// newline(), so the line count may drift - confirm whether that matters.
STRING
    : '"' ( '\\' . | ~('\\'|'"') )* '"' // double quoted string
    | '\'' ( '\\' . | ~('\\'|'\'') )* '\'' // single quoted string
    ;

// Replacement text of a define, taken verbatim up to (not including) the
// first newline that is not preceded by a backslash. The '!' drops the
// backslash of an escaped newline from the token text; the newline is kept.
// Protected: only used from the DIRECTIVE rule.
protected MACRO_TEXT :
    ( '\\'! '\n' {newline();} // escaped newline
    | ~'\n'
    )*;


// A single whitespace character per token. The SKIP action is commented out,
// so whitespace is returned as a token instead of being discarded.
WS :
    ( ' '
    | '\t'
    | '\f'
    | '\n' {newline();}
    ) { /*$setType(Token.SKIP);*/ };

// A single-line or multi-line comment; line numbers are kept up to date for
// every newline consumed. The SKIP action is commented out, so comments are
// returned as tokens instead of being discarded.
COMMENT :
    ( "//" (~'\n')* '\n' {newline();} // single line comment
    | "/*" ( options{greedy=false;} : '\n' {newline();} | ~('\n') )*
"*/" // multi-line comment
    ) { /*$setType(Token.SKIP);*/ };

// A letter or underscore followed by any number of letters, digits or
// underscores. Protected: used by DIRECTIVE and IDENTIFIER, which decide what
// the name means.
protected RAW_IDENTIFIER : ('a'..'z'|'A'..'Z'|'_')
('a'..'z'|'A'..'Z'|'_'|'0'..'9')* ;

// A digit followed by any number of digits, letters or underscores; this
// allows alpha suffixes on numbers (e.g. L for long).
NUMBER : ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')* ;

// group symbols into categories to parse EXPR:
// opening brackets, closing brackets, the argument separator, and every
// other punctuation character (quotes are handled by STRING).
// '#' appears in OPERATOR and also starts DIRECTIVE.
LEFT  : '(' | '[' | '{' ;
RIGHT : ')' | ']' | '}' ;
COMMA : ',' ;
OPERATOR : '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' |
':' | ';' | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '`' | '|' | '~' ;

// Text of one macro-call argument (used by IDENTIFIER). Optional whitespace
// and an optional number or identifier, optionally followed by a bracketed
// group, a string or an operator and then more EXPR. Commas are only accepted
// inside a bracketed group, so a top-level comma ends the argument.
// The kind of closing bracket is not checked against the opening one.
protected EXPR // allow just about anything without being ambiguous
    : (WS)? (NUMBER|IDENTIFIER)?
        (
            ( LEFT EXPR ( COMMA EXPR )* RIGHT
            | STRING
            | OPERATOR // quotes, COMMA, LEFT, and RIGHT not in here
            )
            EXPR
        )?
    ;





 
Yahoo! Groups Links

<*> To visit your group on the web, go to:
    http://groups.yahoo.com/group/antlr-interest/

<*> To unsubscribe from this group, send an email to:
    antlr-interest-unsubscribe at yahoogroups.com

<*> Your use of Yahoo! Groups is subject to:
    http://docs.yahoo.com/info/terms/
 



More information about the antlr-interest mailing list