在上一篇文章中已经介绍了读文件的操作,那么这一篇文章中将会细致解释词法分析。
在源文件里解析出的单词流必须识别为保留字,标识符,常量,操作符和界符五大类
1.显然我们须要列举出全部的保留字,而这里与保留字相似的那么就是标识符,在C语言中,保留字都是以小写字母开头,并且当中的字母仅仅能是小写字母,而标识符的第一个字母则必须为字符(小写大写皆可)后面能够接大写和小写字母和字符 ‘_’, 在我写的这个编译器中,标识符不能超过100,在C语言中的标识符定义的长度大小远远大于此。
2.对于常量,这里须要注意的是整型和浮点型常量。
3.运算符依照的是以下的表:
C语言运算符表
运算符依照优先级大小由上向下排列,在同一行的运算符具有同样优先级。第二行是全部的一元运算符。
运算符 | 解释 | 结合方式 |
() [] -> . | 括号(函数等),数组,两种结构成员訪问 | 由左向右 |
! ~ ++ -- + - * & | 否定,按位否定,增量,减量,正负号, 间接,取地址 | 由右向左 |
* / % | 乘,除,取模 | 由左向右 |
+ - | 加,减 | 由左向右 |
<< >> | 左移,右移 | 由左向右 |
< <= >= > | 小于,小于等于,大于等于,大于 | 由左向右 |
== != | 等于,不等于 | 由左向右 |
& | 按位与 | 由左向右 |
^ | 按位异或 | 由左向右 |
| | 按位或 | 由左向右 |
&& | 逻辑与 | 由左向右 |
|| | 逻辑或 | 由左向右 |
? : | 条件 | 由右向左 |
= += -= *= /= &= ^= |= <<= >>= | 各种赋值 | 由右向左 |
, | 逗号(顺序) | 由左向右 |
4.界符:“;”“{}”,单引號,双引號
接下来我介绍的是对保留字的归类,为了查找方便,将保留字依照a-z的顺序排好,根据数组的下标定位,降低寻找的时间
/*
* keyword.h
*
* Created on: Jun 12, 2014
*
*/ #ifndef KEYWORD_H_
#define KEYWORD_H_ struct keyword{
char *keyName;
}; static struct keyword key__[]={
{"__int64"},
{"end"}
}; static struct keyword key_A[]={
{"auto"},
{"end"}
};
static struct keyword key_B[]={
{"break"},
{"end"}
};
static struct keyword key_C[]={
{"case"},
{"char"},
{"const"},
{"continue"},
{"end"}
};
static struct keyword key_D[]={
{"default"},
{"do"},
{"double"},
{"end"}
};
static struct keyword key_E[]={
{"else"},
{"enum"},
{"extern"},
{"end"}
};
static struct keyword key_F[]={
{"float"},
{"for"},
{"end"}
};
static struct keyword key_G[]={
{"goto"},
{"end"}
};
static struct keyword key_H[]={
{"end"}
};
static struct keyword key_I[]={
{"if"},
{"int"},
{"end"}
};
static struct keyword key_J[]={
{"end"}
};
static struct keyword key_K[]={
{"end"}
};
static struct keyword key_L[]={
{"long"},
{"end"}
};
static struct keyword key_M[]={
{"end"}
};
static struct keyword key_N[]={
{"end"}
};
static struct keyword key_O[]={
{"end"}
};
static struct keyword key_P[]={
{"end"}
};
static struct keyword key_Q[]={
{"end"}
};
static struct keyword key_R[]={
{"register"},
{"return"},
{"end"}
};
static struct keyword key_S[]={
{"short"},
{"signed"},
{"sizeof"},
{"static"},
{"struct"},
{"switch"},
{"end"}
};
static struct keyword key_T[]={
{"typedef"},
{"end"}
};
static struct keyword key_U[]={
{"union"},
{"unsigned"},
{"end"}
};
static struct keyword key_V[]={
{"void"},
{"volatile"},
{"end"}
};
static struct keyword key_W[]={
{"while"},
{"end"}
};
static struct keyword key_X[]={
{"end"}
};
static struct keyword key_Y[]={
{"end"}
};
static struct keyword key_Z[]={
{"end"}
};
// size is 27
static struct keyword *keywords[]={
key__,key_A,key_B,key_C,key_D,key_E,
key_F,key_G,key_H,key_I,key_J,key_K,
key_L,key_M,key_N,key_O,key_P,key_Q,
key_R,key_S,key_T,key_U,key_V,key_W,
key_X,key_Y,key_Z
}; #endif /* KEYWORD_H_ */
以下是词法分析的源代码;
/*
* lex.h
*
* Created on: Jun 13, 2014
*
*/
#include "input.h"
#include "keyword.h" #define isDigit(c) (c>='0' && c<='9')
#define isUpperLetter(c) (c>='A' && c <='Z')
#define isLowerLetter(c) (c>='a' && c<='z')
#define isLetter(c) (isUpperLetter || isLowerLetter)
/*
* lex.c
*
* Created on: Jun 13, 2014
*
*/
#include "zcc.h"
#include "lex.h" #define curr source.cursor int getToken() {
char a[100];
int a_length, i, flag;
/*
*skip ' ','\n' and '\b'
*/
while (*curr == ' ' || *curr == 10 || *curr == 9) {
curr++;
if (*curr == END_OF_FILE) {
return -1;
}
}
/* name or keyword on first is a-z */
a_length=0;
if (*curr >= 'a' && *curr <= 'z') {
IDAndKey:
a_length = 0;
do {
a[a_length++] = *curr++;
} while ( isDigit(*curr) || isUpperLetter(*curr) || isLowerLetter(*curr)
|| *curr == '_');
a[a_length] = '\0';
i = 0;
flag = 0;
if (*a - 'a' <= 26 && *a - 'a' >= 0) {
while (strcmp(keywords[*a - 'a' + 1][i].keyName, "end") != 0) {
if (strcmp(keywords[*a - 'a' + 1][i].keyName, a) == 0) {
flag = 1;
break;
}
i++;
}
if (flag == 1) {
printf("keyword is %s\n", a);
return 1;
} else {
printf("Identify is %s\n", a);
return 1;
}
} else {
printf("Identify is %s\n", a);
return 1;
}
} else if (isUpperLetter(*curr)) {
goto IDAndKey;
} else if (isDigit(*curr)) {
a_length = 0;
do {
a[a_length++] = *curr++;
} while (isDigit(*curr));
//float number
if (*curr == '.') {
do {
a[a_length++] = *curr++;
} while (isDigit(*curr));
a[a_length] = '\0';
printf("float number is %s\n", a);
return 1;
} else {
// number
a[a_length] = '\0';
printf("number is %s\n", a);
return 1;
}
/*
* Operator begin
* */
} else if (*curr == '<') {
a[a_length++] = *curr++;
if (*curr == '<') {
a[a_length++] = *curr++;
lastOperatorDeal:
a[a_length] = '\0';
printf("Operator is %s\n", a);
return 1;
} else if (*curr == '=') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else {
goto lastOperatorDeal;
}
} else if (*curr == '>') {
a[a_length++] = *curr++;
if (*curr == '>') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else if (*curr == '=') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else {
goto lastOperatorDeal;
} } else if (*curr == '=') {
a[a_length++] = *curr++;
if (*curr == '=') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else {
goto lastOperatorDeal;
}
} else if (*curr == '(') {
singleOperator:
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else if (*curr == ')') {
goto singleOperator;
} else if (*curr == '[') {
goto singleOperator;
} else if (*curr == ']') {
goto singleOperator;
} else if (*curr == '-') {
a[a_length++] = *curr++;
if (*curr == '>') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else if (*curr == '-') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else if (*curr == '=') {
a[a_length++] = *curr++;
goto lastOperatorDeal;
} else {
goto lastOperatorDeal;
}
}else if(*curr=='.'){
goto singleOperator;
}else if(*curr=='!'){
a[a_length++]=*curr++;
if(*curr=='='){
goto singleOperator;
}else{
goto lastOperatorDeal;
}
}else if(*curr=='~'){
goto singleOperator;
}else if(*curr=='+'){
a[a_length++]=*curr++;
if(*curr=='+'){
goto singleOperator;
}else if(*curr=='='){
goto singleOperator;
}else {
goto lastOperatorDeal;
}
}else if(*curr=='-'){
a[a_length++]=*curr++;
if(*curr=='-'){
goto singleOperator;
}else if(*curr=='='){
goto singleOperator;
}else {
goto lastOperatorDeal;
}
}else if(*curr=='*'){
a[a_length++]=*curr++;
if(*curr=='='){
goto singleOperator;
}else{
goto lastOperatorDeal;
}
}else if(*curr=='&'){
a[a_length++]=*curr++;
if(*curr=='&'){
goto singleOperator;
}else if(*curr=='='){
goto singleOperator;
}else{
goto lastOperatorDeal;
}
}else if(*curr=='/'){
a[a_length++]=*curr++;
if(*curr=='='){
goto singleOperator;
}if(*curr=='/'){
// skip line
while(*curr!='\n'){
if(*curr==END_OF_FILE)
return -1;
curr++;
}
}else if(*curr=='*'){
curr++;
// skip "/**/"
while(*curr!=END_OF_FILE)
{
if(*curr=='*' && *(curr+1)=='/'){
curr+=2;
break;
}
curr++;
}
}else{
goto lastOperatorDeal;
}
}else if(*curr=='%'){
a[a_length++]=*curr++;
if(*curr=='d'){
goto singleOperator;
}else if(*curr=='c'){
goto singleOperator;
}else if(*curr=='f'){
goto singleOperator;
}else if(*curr=='l'){
a[a_length++]=*curr++;
if(*curr=='d')
goto singleOperator;
else if(*curr=='f')
goto singleOperator;
else
goto singleOperator;
} }else if(*curr=='^'){
a[a_length++]=*curr++;
if(*curr=='='){
goto singleOperator;
}else{
goto lastOperatorDeal;
}
}else if(*curr=='|'){
a[a_length++]=*curr++;
if(*curr=='|'){
goto singleOperator;
}else if(*curr=='='){
goto singleOperator;
}else{
goto lastOperatorDeal;
}
}else if(*curr=='?'){
goto singleOperator;
}else if(*curr==':'){
goto singleOperator;
}else if(*curr==','){
goto singleOperator;
}else if(*curr=='\\'){
a[a_length++]=*curr++;
if(*curr=='n'){
goto singleOperator;
}else {
goto lastOperatorDeal;
} }
/*
* Operator end
* */
/*
* delimiter begin
* */
else if(*curr=='{'){
singleDelimiter:
a[a_length++]=*curr++;
a[a_length]='\0';
printf("Delimiter is %s\n", a);
return 1;
}else if(*curr=='}'){
goto singleDelimiter;
}else if(*curr==';'){
goto singleDelimiter;
}else if(*curr=='\''){
goto singleDelimiter;
}else if(*curr=='\"'){
goto singleDelimiter;
}
}
这里实现了将单词分成五类流,并将单词打印出来,在后面的语法分析中将会使用到这里的单词流结果。
忘了说了,我将自己写的编译器命名为:ZCC,头文件都包括在zcc.h中(*^__^*) 嘻嘻……,想写个类似与gcc 一样奇妙的玩意。
最后看測试文档:
struct Student{
int a;
char* name;
} int main()
{
int a=123;
float a2=1.2345677;
int b=1+3;
for(int i=0; i < 100; i++)
a+=i;
printf("%d\n", a);
return 0;
}
測试结果:
keyword is struct
Identify is Student
Delimiter is {
keyword is int
Identify is a
Delimiter is ;
keyword is char
Operator is *
Identify is name
Delimiter is ;
Delimiter is }
keyword is int
Identify is main
Operator is (
Operator is )
Delimiter is {
keyword is int
Identify is a
Operator is =
number is 123
Delimiter is ;
keyword is float
Identify is a2
Operator is =
float number is 1.2345677
Delimiter is ;
keyword is int
Identify is b
Operator is =
number is 1
Operator is +
number is 3
Delimiter is ;
keyword is for
Operator is (
keyword is int
Identify is i
Operator is =
number is 0
Delimiter is ;
Identify is i
Operator is <
number is 100
Delimiter is ;
Identify is i
Operator is ++
Operator is )
Identify is a
Operator is +=
Identify is i
Delimiter is ;
Identify is printf
Operator is (
Delimiter is "
Operator is %d
Operator is \n
Delimiter is "
Operator is ,
Identify is a
Operator is )
Delimiter is ;
keyword is return
number is 0
Delimiter is ;
Delimiter is }
做到这里,能够告一小段落了,接下来做的事情就是语法分析。