Nidzz.icu

这个人已经没救了,他的博客也很少更新...

有了自动机等理论基础,实现一个词法分析器并不算难,这也是整个编译器前端中最为简单的部分。
下面实现一个较为简单的C语言的词法分析器。

主要功能

  • 识别无符号二进制、16进制、10进制,二进制以0b开头,16进制则以0x开头
  • 能够识别C语言中的大部分关键字
  • 识别运算符、程序结束符等

DFA

识别数字的DFA

t

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
package zhou.lex;

import java.awt.event.KeyEvent;
import java.net.IDN;
import java.nio.channels.FileLockInterruptionException;
import java.util.ArrayList;
import java.util.HashMap;

public class Lexer {
private int rowNumber;
private int pointer;
private String text;
private char [] ctext;
private long tlen;
public HashMap<String, Integer> kw;//关键字
public final static char BLK = ' ';
public final static char TAB = '\t';
public final static char EOL='\0';
public final static char NL = '\n';
public final static char WNL = '\r';
private ArrayList<Token> tks;

public Lexer(String text){
this.text=text;
this.tlen = text.length();
this.ctext = text.toCharArray();
this.tks = new ArrayList<>();
kw = new HashMap<>();
kw.put("int",0);
kw.put("return",1);
kw.put("while",2);
kw.put("if",3);
kw.put("else",4);
kw.put("elif",5);
kw.put("double",6);
kw.put("float",7);
kw.put("long",8);
kw.put("void",9);
kw.put("boolean",10);
kw.put("for",11);
kw.put("char",12);

}
public void start(){
char c;
while (pointer<tlen){
c = ctext[pointer];
if (c==BLK||c==TAB||c=='"') pointer++;
else if (c==NL||c==WNL){
rowNumber++;
pointer++;
}else{
lexPart(c);
}
}

}
private void lexPart(char c){
if (isDigit(c)){
// 如果第一个字母是数字,
String tk = splOne();
if (isNumber(tk)){
tks.add(new Token(tk,Token.CNS,"常数"));
}else{// 非法标识符
try {
throw new TokenException(rowNumber,"“"+tk+"“是非法标识符");
} catch (TokenException e) {
e.printStackTrace();
}
}
}else if (isAlpha(c)){
/*
* 第一个是字母 则可能是 标识符,关键字
* */
String tk = splOne();
if (isIdentify(tk)){
if (isKeyword(tk)){
tks.add(new Token(tk,Token.KW,Token.C_KW));
}else{ // 标识符
tks.add(new Token(tk,Token.IDS,Token.C_IDS));
}
}else{
try {
throw new TokenException(rowNumber,"“"+tk+"”是非法的标识符!");
} catch (TokenException e) {
e.printStackTrace();
}
}

}else{
switch (c){
case '[':
tks.add(new Token("[",Token.ALP,Token.C_RP));
break;
case ']':
tks.add(new Token("]",Token.ARP,Token.C_RP));
break;
case ',':
tks.add(new Token(",",Token.SPL,Token.C_SPL));
break;
case ';':
tks.add(new Token(";",Token.SEMI,Token.C_CEMI));
break;
case '(':
tks.add(new Token("(",Token.SLP,Token.C_LP));
break;
case ')':
tks.add(new Token(")",Token.SRP,Token.C_LP));
break;
case '{':
tks.add(new Token("{",Token.LP,Token.C_LP));
break;
case '}':
tks.add(new Token("}",Token.RP,Token.C_LP));
break;
case '+':
/*
* 可能是 += ++ +
* */
if (pointer+1<tlen){
if (ctext[pointer+1]=='+'){ // ++
tks.add(new Token("++",Token.INC,Token.C_OPER));
pointer++;
}else if (ctext[pointer+1]=='='){//+=
tks.add(new Token("+=",Token.INCEQ,Token.C_ASSIGN));
pointer++;
}else {
tks.add(new Token("+",Token.PLUS,Token.C_OPER));
}
}
break;
case '-':
/*
* 可能是 -= ,--,-
* */
if (pointer+1<tlen){
if (ctext[pointer+1]=='-'){ // ++
tks.add(new Token("--",Token.DEC,Token.C_OPER));
pointer++;
}else if (ctext[pointer+1]=='='){//+=
tks.add(new Token("-=",Token.DECEQ,Token.C_OPER));
pointer++;
}else {
tks.add(new Token("-",Token.SUB,Token.C_OPER));
}
}else{
try {
throw new TokenException(rowNumber,"语法错误!");
} catch (TokenException e) {
e.printStackTrace();
}
}
break;
case '!':
/*
* 可能是 ! ,!=
* */
if (pointer<tlen){
if (ctext[pointer+1]=='='){
tks.add(new Token("!=",Token.NE,Token.C_THAN));
pointer++;
}else {
tks.add(new Token("!",Token.NOT,Token.C_THAN));
}
}else {
try {
throw new TokenException(rowNumber,"语法错误!");
} catch (TokenException e) {
e.printStackTrace();
}
}
break;
case '=':
/*
* 可能是 =,==
* */
if (pointer<tlen){
if (ctext[pointer+1]=='='){
tks.add(new Token("==",Token.EQ,Token.C_THAN));
pointer++;
}else{
tks.add(new Token("=",Token.ASSIGN,Token.C_ASSIGN));
}
}else {
try {
throw new TokenException(rowNumber,"语法错误!");
} catch (TokenException e) {
e.printStackTrace();
}
}
break;
case '<':
if (pointer+1<tlen){
if (ctext[pointer+1]=='='){
tks.add(new Token("<=",Token.LESSEQ,Token.C_THAN));
pointer++;
}else {
tks.add(new Token("<",Token.LESSTHAN,Token.C_THAN));
}
}else try {
throw new TokenException(rowNumber,"语法错误!");
} catch (TokenException e) {
e.printStackTrace();
}
break;
case '>':
if (pointer+1<tlen){
if (ctext[pointer+1]=='='){
tks.add(new Token(">=",Token.THANEQ,Token.C_THAN));
pointer++;
}else {
tks.add(new Token(">",Token.THAN,Token.C_THAN));
}
}else try {
throw new TokenException(rowNumber,"语法错误");
} catch (TokenException e) {
e.printStackTrace();
}
break;
case '*':
/*
* 可能 *,*=,*IDS,**IDS,***IDS
*
* */
if (pointer+1<tlen){
if (ctext[pointer+1]=='='){
tks.add(new Token("*=",Token.MULEQ,Token.C_ASSIGN));
pointer++;
}else {
tks.add(new Token("*",Token.MUL,Token.C_OPER));
}
}else try {
throw new TokenException(rowNumber,"语法错误!");
} catch (TokenException e) {
e.printStackTrace();
}
break;
case '/':
/*
* 可能是 /,//,/=,
* 多行注释 以/*开头以 start start/结尾
* */
if (pointer+1<tlen){
if (ctext[pointer+1]=='='){
tks.add(new Token("/=",Token.DIVEQ,Token.C_ASSIGN));
pointer++;
}else if (ctext[pointer+1]=='/'){
while (pointer<tlen&&(ctext[pointer]!=NL&&ctext[pointer]!=WNL)) pointer++;
rowNumber++;
} else if (ctext[pointer+1]=='*'){ // 多行注释
while (pointer<tlen){
if (ctext[pointer]==NL||ctext[pointer]==WNL) rowNumber++;
if (ctext[pointer]=='/'&&ctext[pointer-1]=='*'&&ctext[pointer-2]=='*'){
break;
}
pointer++;
}
} else{
tks.add(new Token("/",Token.DIV,Token.C_OPER));
}
}else try {
throw new TokenException(rowNumber,"语法错误!");
} catch (TokenException e) {
e.printStackTrace();
}
break;
case '%':
if (pointer+1<tlen){
if (ctext[pointer+1]=='='){
tks.add(new Token("%=",Token.MODEQ,Token.C_ASSIGN));
pointer++;
}else {
tks.add(new Token("%",Token.MOD,Token.C_OPER));
}
}else try {
throw new TokenException(rowNumber,"语法错误!");
} catch (TokenException e) {
e.printStackTrace();
}
}
pointer++;
}
}
public String splOne(){
StringBuilder b= new StringBuilder();
while (pointer<tlen){
if (isAlpha(ctext[pointer])||isDigit(ctext[pointer])){
b.append(ctext[pointer]);
pointer++;
}else return b.toString();
}
return b.toString();
}
public boolean isNumber(String t) {
char ct[] = t.toCharArray();
char c = ct[0];
if (isDigit(c)){
if (c=='0'){
if (t.length()==1) return true;
/*
* 可能是 0b 0x
* */
if (ct[1]=='x'){ // 16进制前缀
if (isHexNumber(ct,2)){
return true;
}else {
try {
throw new TokenException(rowNumber,"“"+t+"”是非法的16进制数");
} catch (TokenException e) {
e.printStackTrace();
}
}
}else if (ct[1]=='b'){//二进制前缀

if (isBinaryNumber(ct,2)){
return true;
}else try {
throw new TokenException(rowNumber,"“"+t+"”是非法的2进制数");
} catch (TokenException e) {
e.printStackTrace();
}
}else{ //十进制
if (isDecimal(ct,1)){
return true;
}else try {
throw new TokenException(rowNumber,"“"+t+"”是非法的10进制数");
} catch (TokenException e) {
e.printStackTrace();
}
}
}else{ // 1-9
if (isDecimal(ct,0)){
return true;
}else try {
throw new TokenException(rowNumber,"“"+t+"”是非法的10进制数");
} catch (TokenException e) {
e.printStackTrace();
}
}
}
return false;
}

/**
*
* @param ct 待分析字符数组 ex: {''}
* @param p
* @return
*/
private boolean isHexNumber(char [] ct,int p){

boolean f=false;
for(int i=p;i<ct.length;i++){
if (isDigit(ct[i])||(ct[i]>='a'&&ct[i]<='f')||(ct[i]>='A'&&ct[i]<='F')) f=true;
else return false;
}
return f;
}
private boolean isBinaryNumber(char [] ct,int p){
boolean f=false;
for(int i=p;i<ct.length;i++){
if (ct[i]=='1'||ct[i]=='0') f=true;
else return false;
}
return f;
}
private boolean isDecimal(char [] ct,int p){
boolean f=false;
for(int i=p;i<ct.length;i++){
if (isDigit(ct[i])) f=true;
else return false;
}
return f;
}
private boolean isAlpha(char c){
if ((c>='a'&&c<='z')||(c>='A'&&c<='Z')) return true;
return false;
}
private boolean isDigit(char c){
if (c>='0'&&c<='9') return true;
return false;
}
public boolean isIdentify(String tk){
char [] ctk = tk.toCharArray();
if (!isAlpha(ctk[0])&&ctk[0]!='_') return false;
else if (tk.length()==1) return true;
boolean f=false;
for(int i=1;i<ctk.length;i++){
if (isAlpha(ctk[i])||ctk[i]=='_'||isDigit(ctk[i])) f = true;
else return false;
}
return f;
}
public boolean isKeyword(String tk){
return kw.containsKey(tk);
}

public ArrayList<Token> getTks() {
return tks;
}
}

问题

我想从硬盘里引导启动,但将一些数据放在了USB-HDD(U盘做成的)里,需要读取。我的物理机上有两块固定硬盘,加上U盘总的也就三块,按理来说直接调用int 13h 8x(8x为U盘模拟成的HDD的驱动器号)就可以直接读取了,但是读取不到(从80-85H都试过了)。由于这方面的资料太少,索性记录一下