【记录】将antlr v2的C/C++的preprocess，即cpp.g，转换为antlr v3

【背景】

需要用antlr实现C语言的预处理：

include，define等等内容。

参考了：

[antlr-interest] completed an ANTLR-based cpp preprocessor (#include, #if, #define, etc)

已经实现了部分的事情。

代码如下：

grammar preprocess;
//lexer grammar preprocess;

// Grammar-wide ANTLR options.
options{
	language=Java;   // target language of the generated recognizer
	output = AST;    // parser rules are generated to build an AST
}

@lexer::header {
//package com.mm.antlrv3demo;

import java.io.*;
import java.util.*;
}

@parser::header {
//package com.mm.antlrv3demo;
}

// Code injected verbatim into the generated lexer class: preprocessor state
// (macro tables) plus the input-stream stack that takes over the job the
// ANTLR v2 TokenStreamSelector did in the original cpp.g.
@lexer::members {
    // v2 leftover, kept for reference only (no longer declared):
    //public static TokenStreamSelector selector; // must be assigned externally
    protected static Integer ifState = 1; // -1: no-else false, 0:false, 1: true
    protected static List ifStates = new ArrayList(); // holds nested if conditions
    // macro name -> List; the rules in this grammar read index 0 as the macro text
    // and indices 1..n as the formal parameter names
    protected static Map defines = new Hashtable(); // holds the defines
    protected Map defineArgs = new Hashtable(); // holds the args for a macro call
    // v2 end-of-file hook, disabled; nextToken() below does the equivalent work.
    /*
    public void uponEOF() throws TokenStreamException, CharStreamException {
        try {
            selector.pop(); // return to old lexer/stream
            selector.retry();
        } catch (NoSuchElementException e) {
            // return a real EOF if nothing in stack
        }
    }
    */
	
	// Remembers a character stream together with a mark taken on it at construction time,
	// so that lexing can later resume on that stream at that position.
	class SaveStruct {
      SaveStruct(CharStream input){
        this.input = input;
        this.marker = input.mark();
      }
      public CharStream input;
      public int marker;
     }
 
     // Suspended inputs, innermost on top. Entries are pushed when the lexer switches
     // to another stream and popped again in nextToken() when that stream hits EOF.
     Stack<SaveStruct> includes = new Stack<SaveStruct>();
 
    // We should override this method for handling EOF of included file
    // Returns the next token, transparently resuming the suspended outer stream
    // whenever the current stream is exhausted and the stack is not empty.
     public Token nextToken(){
       Token token = super.nextToken();
 
       if(token.getType() == Token.EOF && !includes.empty()){
        // We've got EOF and have non empty stack.
         SaveStruct ss = includes.pop();
         setCharStream(ss.input);
         // go back to the position that was marked when the stream was suspended
         input.rewind(ss.marker);
         //this should be used instead of super [like below] to handle exits from nested includes
         //it matters, when the 'include' token is the last in previous stream (using super, lexer 'crashes' returning EOF token)
         token = this.nextToken();
       }
 
      // Skip first token after switching on another input.
      // You need to use this rather than super as there may be nested include files
      // (a negative start index is used here as the sign that the token was produced
      // across a stream switch)
       if(((CommonToken)token).getStartIndex() < 0)
         token = this.nextToken();
 
       return token;
     }
}

// C/C++ comments. Both forms call skip(), so no token is produced for them.
// NOTE(review): the line-comment alternative requires a terminating '\n'; a '//' comment
// on the very last line of a file that has no final newline would not match - confirm
// whether such input needs to be supported.
COMMENT
    :   ('//' ~('\n'|'\r')* '\r'? '\n') {skip();}                 // line comment, including its line terminator
    |   ('/*' ( options {greedy=false;} : . )* '*/') {skip();}    // block comment; non-greedy, so it ends at the first '*/'
    ;

// #include "file": suspends the current character stream and continues lexing
// inside the named file. The suspended stream is resumed by nextToken() once the
// included file reaches EOF.
//
// WS matches exactly one whitespace character, so WS* (rather than an optional
// single WS) is needed to accept any amount of blank space before the file name.
//
// Throws java.lang.Error if the file cannot be opened; the underlying
// IOException is attached as the cause.
INCLUDE    :    '#include' WS* f=STRING
{
    // strip the surrounding double quotes from the matched STRING
    String name = f.getText();
    name = name.substring(1,name.length()-1);

    // Open the file first: if that fails, the include stack must stay untouched.
    CharStream included;
    try {
        included = new ANTLRFileStream(name);
    } catch(IOException fnf) {
        throw new Error("Cannot open file " + name, fnf);
    }

    // save current lexer's state (stream plus a mark just behind this directive)
    SaveStruct ss = new SaveStruct(input);
    includes.push(ss);

    // switch on new input stream and start over at its beginning
    setCharStream(included);
    reset();
};
/*
fragment
NON_CR_LF	:	~('\r'|'\n');

fragment
TAB_SPACE
	:	(' ' | '\t');
*/

//DIRECTIVE 	:	('#define' WS* defineMacro=ID WS* defineText=STRING)
//DIRECTIVE 	:	('#define' WS* defineMacro=ID WS* defineText=( NON_CR_LF+ | (NON_CR_LF* (TAB_SPACE+ '\\' '\r'? '\n' NON_CR_LF+)*) ) )


// Replacement text of a #define: everything up to (but not including) the end of
// the line. A backslash directly followed by an optional CR and an LF is consumed
// as part of the text, so a definition can continue on the next line.
// The commented-out variants are earlier attempts at removing the continuation
// backslash from the text (via skip(), the hidden channel, setText, or a rewrite).
fragment
//MACRO_TEXT :    ( (('\\'){skip();System.out.println("skip line tail back slash");} '\r'? '\n')
//MACRO_TEXT :    ( ('\\'{$channel=HIDDEN;System.out.println("set back slash to hidden");} '\r'? '\n')
//MACRO_TEXT :    ( (('\\'){setText("");System.out.println("set back slash to empty");} '\r'? '\n')
MACRO_TEXT :    (('\\' '\r'? '\n') | (~('\r'|'\n')))*;
//MACRO_TEXT :    ( ('\\' '\r'? '\n') | (~('\r'|'\n')))* -> ( ('\r'? '\n') | (~('\r'|'\n')))*;

// #define NAME, #define NAME text, or #define NAME(p1,p2,...) text
//
// Records the macro in the shared defines table and emits no token (skip()).
// The stored value is a List laid out as:
//   index 0     - the replacement text, verbatim ("" when the macro has none)
//   index 1..n  - the formal parameter names, in declaration order
// This is the layout IDENTIFIER relies on (get(0) for the text, get(1+i) for the
// i-th parameter).
//
// The directive ends at '\n'; a preceding '\r' is accepted so that files with
// CRLF line endings work (MACRO_TEXT stops in front of '\r' and '\n').
DIRECTIVE @init{
    List args = new ArrayList();
} 	:	('#define' WS* defineMacro=RAW_IDENTIFIER
    {
        // Reserve slot 0 for the macro text before any parameter name is appended.
        // Without it, args.set(0, ...) below fails on an empty list for object-like
        // macros and overwrites the first parameter name for function-like ones.
        args.add("");
    }
        (
            ( '(' // get arguments if you find them (no spaces before left paren)
                (WS)? defineArg0=RAW_IDENTIFIER (WS)? {args.add(defineArg0.getText());}
                ( ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)? {args.add(defineArg1.getText());} )*
              ')'
            | ' '|'\t'|'\f'
            )
            ( options{greedy=true;}: ' '|'\t'|'\f' )*
            // store the text verbatim - tokenize when called
            defineText=MACRO_TEXT {args.set(0,defineText.getText());}
        )? '\r'? '\n'
    {
     	defines.put( defineMacro.getText(), args );
        skip();
    }
    );

// An identifier, with macro-argument substitution: if the name is found in
// defineArgs, its replacement text is handed to a sub-lexer.
//
// NOTE(review): this rule still carries ANTLR v2 mechanics and does not look
// compilable as-is:
//   - `selector` is only present as a commented-out declaration in @lexer::members;
//   - the sub-lexer is built from a DataInputStream, whereas ANTLR v3 generated
//     lexers are normally constructed from a CharStream - confirm.
// NOTE(review): only defineArgs is consulted here; the `defines` table filled by
// DIRECTIVE is never looked up, so plain #define macros would not be expanded.
IDENTIFIER @init{
	List define = new ArrayList();
	List args = new ArrayList();
} :
    identifier=RAW_IDENTIFIER
    {
        // see if this is a macro argument
        define = (List)defineArgs.get(identifier.getText());
    }
    // first alternative: entry has parameter names (size > 1), so a call argument list is required
    ( { (define!=null) && (define.size()>1) }? (WS|COMMENT)?
        // take in arguments if macro call requires them
        '('
        callArg0=EXPR {args.add(callArg0.getText());}
        ( COMMA callArg1=EXPR {args.add(callArg1.getText());} )*
        { args.size()==define.size()-1 }? // better have right amount
        ')'
    // second alternative: not a parameterised macro, nothing more to consume
    | { !((define!=null) && (define.size()>1)) }?
    )
{
if (define!=null) {
	// index 0 of the entry is the replacement text
	String defineText = (String)define.get(0);

	// create a new lexer to handle the macro text
	preprocessLexer sublexer = new preprocessLexer(new DataInputStream(new StringBufferInputStream(defineText)));
	for (int i=0;i<args.size();++i) {
		// treat macro arguments similar to local defines
		List arg = new ArrayList();
		arg.add((String)args.get(i));
		// index 1+i of the entry is the name of the i-th formal parameter
		sublexer.defineArgs.put( (String)define.get(1+i), arg );
	}
	selector.push(sublexer);
	// retry in new lexer
	selector.retry();

}
};

// Plain C identifier with no macro handling: a letter or underscore, followed by
// any number of letters, digits or underscores. Fragment - only usable from other lexer rules.
fragment
RAW_IDENTIFIER
    :   ( '_' | 'A'..'Z' | 'a'..'z' )
        ( '_' | 'A'..'Z' | 'a'..'z' | '0'..'9' )*
    ;

// A digit followed by any run of digits, letters and underscores.
NUMBER : ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')* ; // allow alpha suffixes on numbers (i.e. L:long)

// Punctuation is bucketed into a few token kinds so that EXPR can be written in
// terms of opening brackets, closing brackets, commas and "everything else".
LEFT
    :   '(' | '[' | '{'
    ;

RIGHT
    :   ')' | ']' | '}'
    ;

COMMA
    :   ','
    ;

// All remaining single-character symbols (no quotes, no COMMA, no LEFT/RIGHT).
OPERATOR
    :   '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | ':'
    |   ';' | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '`' | '|' | '~'
    ;


// Text of one macro-call argument. The rule is recursive: an optional leading
// NUMBER or IDENTIFIER, optionally followed by a bracketed group, a STRING or an
// OPERATOR and then more EXPR. A COMMA is only accepted inside a LEFT ... RIGHT
// group, so a comma outside any brackets ends the argument.
fragment EXPR // allow just about anything without being ambiguous
    : (WS)? (NUMBER|IDENTIFIER)?
        (
            ( LEFT EXPR ( COMMA EXPR )* RIGHT // bracketed group, may contain comma-separated parts
            | STRING
            | OPERATOR // quotes, COMMA, LEFT, and RIGHT not in here
            )
            EXPR // remainder of the argument (may be empty)
        )?
    ;

//INT :	'0'..'9'+    ;

// Floating-point literal in one of three shapes.
// NOTE(review): NUMBER also accepts digit-led text that contains letters (for
// example 1e5), so the third alternative may overlap with NUMBER - check the
// ambiguity warnings ANTLR reports for this grammar.
FLOAT
    :   ('0'..'9')+ '.' ('0'..'9')* EXPONENT?   // digits, dot, optional fraction, optional exponent
    |   '.' ('0'..'9')+ EXPONENT?               // leading dot, fraction, optional exponent
    |   ('0'..'9')+ EXPONENT                    // digits with mandatory exponent, no dot
    ;

// One whitespace character (blank, tab, CR or LF). The token is placed on the
// hidden channel rather than discarded.
WS
    :   ( ' ' | '\t' | '\r' | '\n' ) {$channel=HIDDEN;}
    ;

//RestSymbo	:	'{' | '}' | '&' | ';' | ',' | '+' | '-' | ')' | '(' | '~' | '/' | '`' | '$' | '@' | '%' | '^' | '#' | '\\' ;

// Double-quoted string literal: any mix of escape sequences and characters other
// than backslash and double quote.
STRING
    :   '"'
        ( ~( '"' | '\\' ) | ESC_SEQ )*
        '"'
    ;

// Single-quoted character literal: exactly one escape sequence or one character
// other than backslash and single quote.
CHAR
    :   '\''
        ( ~( '\\' | '\'' ) | ESC_SEQ )
        '\''
    ;

// Exponent part of a floating-point literal: e or E, optional sign, one or more digits.
fragment
EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;

// A single hexadecimal digit (either case).
fragment
HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;

// Escape sequence inside STRING and CHAR: a simple backslash escape, a unicode
// escape, or an octal escape.
fragment
ESC_SEQ
    :   '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
    |   UNICODE_ESC
    |   OCTAL_ESC
    ;

// Backslash followed by one to three octal digits; in the three-digit form the
// first digit is limited to 0..3.
fragment
OCTAL_ESC
    :   '\\' ('0'..'3') ('0'..'7') ('0'..'7')
    |   '\\' ('0'..'7') ('0'..'7')
    |   '\\' ('0'..'7')
    ;

// Backslash, the letter u, then exactly four hexadecimal digits.
fragment
UNICODE_ESC
    :   '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
    ;
    
// Parser rules. `header` accepts zero or more include directives.
header
	:	include*;
// A single INCLUDE token; the trailing comment is an earlier, angle-bracket form of the rule.
include	:	INCLUDE;//'#include ' '<' ID ('.h' | '.ddl') '>';

但是还是遇到很多问题，其中主要就是，针对于旧的antlr v2的TokenStreamSelector，如何换用成antlr v3的逻辑，用哪些函数和类替代。

【折腾过程】

1.关于预处理的问题，这人：

[antlr-interest] C PreProcessor Errors

也遇到类似的事情，但是对此处没啥帮助。

2.这里：

[antlr-interest] ANTLR 3 migration: TokenStreamSelector

和：

[antlr-interest] TokenStreamSelector + ANTLRv3

也提到了，v2转v3时，如何处理TokenStreamSelector，但是没人回答。

3.这里：

Tips on designing a preprocessor for C++ using Antlr

关于预处理，已经解释的很全了，但是还是antlr v2的版本，还是不能完全透彻的理解，还是无法找到TokenStreamSelector的替代品。

4.google搜：

antlr TokenStream Selector deprecated

看到了“Token Stream Multiplexing”，所以，去找找antlr作者写的书

The Definitive ANTLR Reference.pdf

看看其中关于此部分的解释，或许可以找到有价值的参考资料。

5.另外，顺便提示一句，上述代码中的那个：

testLiterals

实际上是antlr v2的语法

根据：

Migrating from ANTLR 2 to ANTLR 3

的某人评论，得知此testLiterals，antlr v3中也没了。

6.参考：

[antlr-interest] v3 – How to deal with include Files?

也讨论了类似问题，但是还是无解。

7.自己看代码，有一点点眉目了：

（1）antlr v2中的处理新的lexer（和tokenStream）的逻辑

    public static TokenStreamSelector selector; // must be assigned externally
    protected static Map defines = new Hashtable(); // holds the defines
    
    public void uponEOF() throws TokenStreamException, CharStreamException {
        try {
            selector.pop(); // return to old lexer/stream
            selector.retry();
        }
        ......
    }
    

: '#'
    ( "include" (WS)? includeFile:STRING {
            ......
            try {
                cppLexer sublexer = new cppLexer(new DataInputStream(new FileInputStream(name)));
                sublexer.defines = defines; // want defines to be persistent
                sublexer.setFilename(name);
                selector.push(sublexer);
                selector.retry();
            }
            ......
        }
    }
......

    } else {
        // create a new lexer to handle the macro text
        cppLexer sublexer = new cppLexer(new DataInputStream(new StringBufferInputStream(defineText)));
        
        ......
        
        selector.push(sublexer);
        // retry in new lexer
        selector.retry();
    }
}};

即，主要是：

用new cppLexer新建一个sublexer，

然后初始化一堆东西，比如：

给对应的全局变量defines赋值等等

然后就转到新的sublexer去处理了，调用方法是：

先push

再retry

而后，对于新的lexer，都有对应的uponEOF，

其目的是：遇到了EOF时，要返回之前的（父级的）lexer，所以：

先去pop（返回到上一级，父级的lexer）

再去retry（相当于刷新，去使用当前的，父级的lexer）

（2）而与此相对应的，目前已经实现了，antlr v3的，处理新的lexer（和tokenStream）的代码是：

     Stack<SaveStruct> includes = new Stack<SaveStruct>();
 
    // We should override this method for handling EOF of included file
     public Token nextToken(){
       ......
 
       if(token.getType() == Token.EOF && !includes.empty()){
        // We've got EOF and have non empty stack.
         SaveStruct ss = includes.pop();
         setCharStream(ss.input);
         input.rewind(ss.marker);
         
         ......
       }
 
      ......
     }

// and lexer rule
INCLUDE    :    '#include' (WS)? f=STRING 
{
    ......
    try {
        // save current lexer's state
        SaveStruct ss = new SaveStruct(input);
        includes.push(ss);
 
        // switch on new input stream
        setCharStream(new ANTLRFileStream(name));
        reset();

    }
    ......
};

逻辑是：

也是，对于遇到了要include的文件，

类似于新的lexer

然后先新建一个SaveStruct，用来保存当前的CharStream及其位置标记（marker）

将其保存起来，即push，即压栈

然后使用当前新的CharStream

然后用reset，使得回到文件最开始处，再重新处理

这样，就是：

先保存了旧的，父级的lexer（tokenStream）

然后用当前child级别的lexer去处理新的内容

处理完成后，即遇到了EOF

然后会在上面的nextToken中遇到

会去对于全局的变量includes，去pop，拿出来，之前保存的父级的lexer

然后通过setCharStream把后续要处理的内容拿出来

再通过input.rewind，定位到之前记录的位置，

就可以继续去处理了。

以此实现了递归的调用。

而基本明白了递归调用，递归处理父级和子级的lexer或tokenStream，CharStream的逻辑后，

接下来，就可以，参考两者的不同之处，找到antlr v3中，如何去模拟此套逻辑了。

8.关于cppLexer.g中的多参数的#define实现宏替换的逻辑过程，参见：

【整理】分析cppLexer.g中的多参数的#define实现宏替换的逻辑过程

搞懂逻辑后，接下来，才是，如何将其转化为antlr v3版本的代码，实现同样的逻辑。

9.暂时写了如下代码：

// Replacement text of a #define (second attempt): everything up to, but not
// including, the next '\n'. A backslash directly followed by '\n' is consumed as
// part of the text, so a definition can continue on the next line. Unlike the
// commented-out variants, '\r' is no longer excluded, so it becomes part of the text.
// The commented-out lines are earlier attempts kept for reference.
fragment
//MACRO_TEXT :    ( (('\\'){skip();System.out.println("skip line tail back slash");} '\r'? '\n')
//MACRO_TEXT :    ( ('\\'{$channel=HIDDEN;System.out.println("set back slash to hidden");} '\r'? '\n')
//MACRO_TEXT :    ( (('\\'){setText("");System.out.println("set back slash to empty");} '\r'? '\n')
//MACRO_TEXT :    (('\\' '\r'? '\n') | (~('\r'|'\n')))*;
//MACRO_TEXT :    (('\\' '\r'? '\n') | (~('\n')))*;
MACRO_TEXT :    (('\\' '\n') | (~('\n')))*;
//MACRO_TEXT :    ( ('\\' '\r'? '\n') | (~('\r'|'\n')))* -> ( ('\r'? '\n') | (~('\r'|'\n')))*;


// #define NAME, #define NAME text, or #define NAME(p1,p2,...) text
//
// Stores the macro in the shared defines table and emits no token (skip()).
// The stored List holds the replacement text at index 0 ("" if there is none)
// and the formal parameter names, in order, from index 1 onwards.
DIRECTIVE @init{
    List macroInfo = new ArrayList();
}
    :   ( '#define' WS* defineMacro=RAW_IDENTIFIER
          {
              // slot 0 is reserved for the macro text
              macroInfo.add("");
          }
          (
              (   '(' // parameter list; the paren must follow the name directly
                      (WS)? defineArg0=RAW_IDENTIFIER (WS)?
                      { macroInfo.add(defineArg0.getText()); }
                      (   ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)?
                          { macroInfo.add(defineArg1.getText()); }
                      )*
                  ')'
              |   ' '
              |   '\t'
              |   '\f'
              )
              ( options{greedy=true;}: ' '|'\t'|'\f' )*
              // the text is kept verbatim and only tokenized when the macro is used
              defineText=MACRO_TEXT
              { macroInfo.set(0, defineText.getText()); }
          )?
          '\n'
          {
              defines.put(defineMacro.getText(), macroInfo);
              skip();
          }
        )
    ;

// An identifier, with macro expansion (v3 attempt).
//
// The name is looked up first among the current macro-call arguments
// (defineArgs) and then among the #define table (defines). For an entry without
// parameters the token text is replaced by the macro text; for an entry with
// parameters the call arguments are collected, bound to the parameter names, and
// the lexer switches to a string stream holding the macro text, using the same
// includes stack as the INCLUDE rule.
//
// NOTE(review): bindings are written into this lexer's own defineArgs map and are
// not removed anywhere in the visible code, so they appear to stay in effect
// after the expansion has finished - confirm whether that is intended.
IDENTIFIER @init{
	List define = new ArrayList();
	List foundArgs = new ArrayList();
    
    String callArg0Text = "";
    String callArg1Text = "";
} :
    identifier=RAW_IDENTIFIER
    {
        // see if this is a macro argument
        define = (List)defineArgs.get(identifier.getText());
        if (define==null) {
            // see if this is a macro call
            define = (List)defines.get(identifier.getText());
        }
    }
    // first alternative: no entry, or an entry without parameters - nothing more to consume
    ( { !((define!=null) && (define.size()>1)) }? 
    |
    // second alternative: entry has parameter names (size > 1), so a call argument list is required
    { (define!=null) && (define.size()>1) }? (WS|COMMENT)?
        // take in arguments if macro call requires them
        '('
        callArg0=EXPR
        {
            callArg0Text = callArg0.getText(); 
            foundArgs.add(callArg0Text);
        }
        ( COMMA callArg1=EXPR 
        {
            callArg1Text = callArg1.getText();
            foundArgs.add(callArg1Text);
        }
        )*
        { foundArgs.size()==define.size()-1 }? // better have right amount
        ')'
    )
{
if (define!=null) {
	// index 0 of the entry is the replacement text
	String defineText = (String)define.get(0);
    
    if (define.size()==1) {
        //only have one value in list -> the defineText is the define para content -> just need replace directly
        setText(defineText);
    } else {
        //add new dict pair: (para, call value)
        for (int i=0;i<foundArgs.size();++i) {
            // treat macro arguments similar to local defines
            List arg = new ArrayList();
            arg.add((String)foundArgs.get(i));
            // index 1+i of the entry is the name of the i-th formal parameter
            defineArgs.put( (String)define.get(1+i), arg );
        }
        
        // save current lexer's state
        SaveStruct ss = new SaveStruct(input);
        includes.push(ss);

        // switch on new input stream
        setCharStream(new ANTLRStringStream(defineText));
        reset();
    }
}
};

但是还没成功，且遇到一个问题：

【未解决】antlr v3的lexer的条件性匹配

10.

转载请注明：在路上 » 【记录】将antlr v2的C/C++的preprocess，即cpp.g，转换为antlr v3

Post Views: 1,878

与本文相关的文章