1 /++
2 For reading a script into tokens
3 +/
4 module qscript.compiler.tokengen;
5
6 import utils.misc;
7 import utils.lists;
8 import std.conv:to;
9 import qscript.compiler.compiler;
10
11 /// Each token is stored as a `Token` with the type and the actual token
12 package struct Token{
13 /// Specifies type of token
14 ///
15 /// used only in `compiler.tokengen`
16 enum Type{
17 String,/// That the token is: `"SOME STRING"`
18 Char, /// That the token is: `'C'` # Some character
19 Integer,/// That the token an int (or uint, or ubyte)
20 Double, /// That the token is a double (floating point) value
21 Bool, /// a true or false value
22 Identifier,/// That the token is an identifier. i.e token is a variable name or a function name. For a token to be marked as Identifier, it doesn't need to be defined in `new()`
23 DataType, /// the token is a data type
24 MemberSelector, /// a member selector operator
25 AssignmentOperator, /// and assignmentOperator
26 Operator,/// That the token is an operator, like `+`, `==` etc
27 Keyword,/// A `function` or `var` ...
28 Comma,/// That its a comma: `,`
29 StatementEnd,/// A semicolon
30 ParanthesesOpen,/// `(`
31 ParanthesesClose,/// `)`
32 IndexBracketOpen,/// `[`
33 IndexBracketClose,///`]`
34 BlockStart,///`{`
35 BlockEnd,///`}`
36 }
37 Type type;/// type of token
38 string token;/// token
39 this(Type tType, string tToken){
40 type = tType;
41 token = tToken;
42 }
43 /// Identifies token type itself
44 ///
45 /// Throws: Exception if token invalid
46 this(string tToken){
47 token = tToken;
48 type = getTokenType(token);
49 }
50 }
51
52 /// To store Tokens with Types where the line number of each token is required
53 package struct TokenList{
54 Token[] tokens; /// Stores the tokens
55 uinteger[] tokenPerLine; /// Stores the number of tokens in each line
56 /// Returns the line number a token is in by usin the index of the token in `tokens`
57 ///
58 /// The returned value is the line-number, not index, it starts from 1, not zero
59 uinteger getTokenLine(uinteger tokenIndex){
60 uinteger i = 0, chars = 0;
61 tokenIndex ++;
62 for (; chars < tokenIndex && i < tokenPerLine.length; i ++){
63 chars += tokenPerLine[i];
64 }
65 return i;
66 }
67 /// reads tokens into a string
68 static string toString(Token[] t){
69 char[] r;
70 // set length
71 uinteger length = 0;
72 for (uinteger i = 0; i < t.length; i ++){
73 length += t[i].token.length;
74 }
75 r.length = length;
76 // read em all into r;
77 for (uinteger i = 0, writeTo = 0; i < t.length; i ++){
78 r[writeTo .. writeTo + t[i].token.length] = t[i].token;
79 writeTo += t[i].token.length;
80 }
81 return cast(string)r;
82 }
83 }
84
85 /// Attempts to identify a token type by the token (string).
86 /// returns token type, if fails, throws exception
87 private Token.Type getTokenType(string token){
88 /// Returns true if a string is a keyword
89 bool isKeyword(string s){
90 return KEYWORDS.hasElement(s);
91 }
92 /// Returns true if a string is an identifier
93 bool isIdentifier(string s){
94 // token that qualifies as a number can qualify as an identifier, but not vice versa, so this if statement
95 if (!token.isNum && !isKeyword(token)){
96 return (cast(char[])s).matchElements(cast(char[])IDENT_CHARS);
97 }else{
98 return false;
99 }
100 }
101 /// Returns true is a string is an operator
102 bool isOperator(string s){
103 return OPERATORS.hasElement(s) || SOPERATORS.hasElement(s);
104 }
105 /// Returns true if string contains an integer
106 bool isInt(string s){
107 return isNum(s, false);
108 }
109 /// Returns true if a string contains a double
110 ///
111 /// to be identified as a double, the number must have a decimal point in it
112 bool isDouble(string s){
113 return isNum(s, true);
114 }
115 if (token == "."){
116 return Token.Type.MemberSelector;
117 }else if (token == "="){
118 return Token.Type.AssignmentOperator;
119 }else if (isInt(token)){
120 return Token.Type.Integer;
121 }else if (isDouble(token)){
122 return Token.Type.Double;
123 }else if (token == "true" || token == "false"){
124 return Token.Type.Bool;
125 }else if (DATA_TYPES.hasElement(token)){
126 return Token.Type.DataType;
127 }else if (isKeyword(token)){
128 return Token.Type.Keyword;
129 }else if (isIdentifier(token)){
130 return Token.Type.Identifier;
131 }else if (isOperator(token)){
132 return Token.Type.Operator;
133 }else if (token[0] == '"'){
134 return Token.Type.String;
135 }else if (token[0] == '\''){
136 if (token.length < 3)
137 throw new Exception("no character provided inside ''");
138 if (decodeString(token[1 .. $-1]).length > 1)
139 throw new Exception("'' can only hold 1 character");
140 return Token.Type.Char;
141 }else if (token == ";"){
142 return Token.Type.StatementEnd;
143 }else if (token == ","){
144 return Token.Type.Comma;
145 }else if (token == "("){
146 return Token.Type.ParanthesesOpen;
147 }else if (token == ")"){
148 return Token.Type.ParanthesesClose;
149 }else if (token == "["){
150 return Token.Type.IndexBracketOpen;
151 }else if (token == "]"){
152 return Token.Type.IndexBracketClose;
153 }else if (token == "{"){
154 return Token.Type.BlockStart;
155 }else if (token == "}"){
156 return Token.Type.BlockEnd;
157 }else{
158 throw new Exception("unidentified token type '"~token~'\'');
159 }
160 }
161 ///
162 unittest{
163 assert("thisIsAVar_1234".getTokenType == Token.Type.Identifier);
164 assert("24.5".getTokenType == Token.Type.Double);
165 assert("245".getTokenType == Token.Type.Integer);
166 assert("\"This is a string\"".getTokenType == Token.Type.String);
167 assert("==".getTokenType == Token.Type.Operator);
168 assert(";".getTokenType == Token.Type.StatementEnd);
169 assert(",".getTokenType == Token.Type.Comma);
170 assert("int".getTokenType == Token.Type.DataType);
171 assert("double".getTokenType == Token.Type.DataType);
172 assert("char".getTokenType == Token.Type.DataType);
173 assert("function".getTokenType == Token.Type.Keyword);
174 assert("if".getTokenType == Token.Type.Keyword);
175 assert("while".getTokenType == Token.Type.Keyword);
176 assert("else".getTokenType == Token.Type.Keyword);
177 assert(".".getTokenType == Token.Type.MemberSelector);
178 assert("\'p\'".getTokenType == Token.Type.Char);
179 assert("null".getTokenType == Token.Type.Keyword);
180 assert("true".getTokenType == Token.Type.Bool);
181 assert("false".getTokenType == Token.Type.Bool);
182 }
183
184 /// returns Token[] with type identified based on string[] input
185 private Token[] stringToTokens(string[] s, bool detectType = true){
186 Token[] r;
187 r.length = s.length;
188 foreach (i, token; s){
189 if (detectType)
190 r[i].type = getTokenType(s[i]);
191 r[i].token = s[i].dup;
192 }
193 return r;
194 }
195
196 /// Reads script, and separates tokens
197 private TokenList separateTokens(string[] script, LinkedList!CompileError compileErrors){
198 static bool isDifferent(char c, ref char[] token){
199 static const SEPERATORS = ['(','[','{','}',']',')', ';', ','];
200 static const WHITESPACE = [' ', '\t'];
201 static char[] lastToken = []; /// stores last complete token, used to check if `-` or `.` is to be considered operator or part of number
202 static char pendingTokenChar = 0; /// as the name says...
203 if (pendingTokenChar != 0){
204 token = [pendingTokenChar];
205 pendingTokenChar = 0;
206 if (SEPERATORS.hasElement(token[0])){
207 if (!WHITESPACE.hasElement(c))
208 pendingTokenChar = c;
209 lastToken = token.dup;
210 return true;
211 }
212 }
213 if (token.length && ['"', '\''].hasElement(token[0])){
214 token = token ~ c;
215 if (c == token[0] && token[$-2] != '\\'){
216 lastToken = token.dup;
217 return true;
218 }
219 return false;
220 }
221 if (WHITESPACE.hasElement(c)){
222 if (token.length > 0){
223 lastToken = token.dup;
224 return true;
225 }
226 return false;
227 }
228 if (SEPERATORS.hasElement(c)){
229 if (token.length == 0){
230 token = [c];
231 lastToken = token.dup;
232 return true;
233 }
234 pendingTokenChar = c;
235 lastToken = token.dup;
236 return true;
237 }
238 if (token.length > 0){
239 // strings
240 if (token[0] == '"' || token[0] == '\''){
241 token = token ~ c;
242 if (c == token[0] && token[$-1] != '\\'){
243 lastToken = token.dup;
244 return true;
245 }
246 return false;
247 }
248 // unexpected strings get read as separate tokens
249 if ((c == '"' || c == '\'') && token[0] != c){
250 pendingTokenChar = c;
251 lastToken = token.dup;
252 return true;
253 }
254 // space
255 if (c == ' ' || c == '\t'){
256 lastToken = token.dup;
257 return true;
258 }
259 // - is operator or part of number
260 if (token[0] == '-' && isNum([c],false) && !(lastToken.matchElements(cast(char[])IDENT_CHARS))){
261 token = token ~ c;
262 // go on
263 return false;
264 }
265 // . is memberSelector or decimal place
266 if (c == '.' && !isNum(cast(string)token, false)){
267 lastToken = token;
268 pendingTokenChar = c;
269 return true;
270 }
271 // token is operator
272 if (OPERATORS.hasElement(cast(string)token) || SOPERATORS.hasElement(cast(string)token)){
273 // see if it's still operator after adding c
274 if (OPERATORS.hasElement(cast(string)(token ~ c)) || SOPERATORS.hasElement(cast(string)(token ~ c))){
275 // go on
276 token = token ~ c;
277 return false;
278 }else{
279 pendingTokenChar = c;
280 lastToken = token.dup;
281 return true;
282 }
283 }else if ((OPERATORS.hasElement(cast(string)[c]) || SOPERATORS.hasElement(cast(string)[c])) && !isNum(cast(string)(token~c))){
284 // token not operator, c is operator
285 pendingTokenChar = c;
286 lastToken = token.dup;
287 return true;
288 }
289 }
290 // nothing else matches, just add it to end
291 token = token ~ c;
292 return false;
293 }
294 LinkedList!string tokens = new LinkedList!string;
295 uinteger[] tokenPerLine;
296 tokenPerLine.length = script.length;
297 uinteger tokenCount = 0;
298 foreach (lineno, line; script){
299 integer stringEndIndex = -1;
300 char[] token = [];
301 for (uinteger i = 0; i < line.length; i ++){
302 // skip strings
303 if ((line[i] == '"' || line[i] == '\'') && i > stringEndIndex){
304 stringEndIndex = line.strEnd(i);
305 if (stringEndIndex == -1){
306 compileErrors.append(CompileError(lineno, "string not closed"));
307 break;
308 }
309 }
310 // break at comments
311 if (line[i] == '#' && cast(integer)i > stringEndIndex){
312 isDifferent(' ', token);
313 // add pending token
314 if (token.length){
315 tokens.append(cast(string)token.dup);
316 token = [];
317 }
318 break;
319 }
320 // hand this line[i] to isDifferent
321 if (isDifferent(line[i], token)){
322 tokens.append(cast(string)token.dup);
323 token = [];
324 }
325 }
326 isDifferent(' ', token);
327 if (token.length)
328 tokens.append(cast(string)token.dup);
329 tokenPerLine[lineno] = tokens.count - tokenCount;
330 tokenCount += tokenPerLine[lineno];
331 }
332 // put them all in TokenList
333 TokenList r;
334 r.tokenPerLine = tokenPerLine; // no need to dup it
335 r.tokens = stringToTokens(tokens.toArray, false);
336 .destroy (tokens);
337 return r;
338 }
339 ///
340 unittest{
341 string[] script = [
342 "function void main{",
343 "\tint i = 5;",
344 "\t.5sdfdf = (!5 - 5);",
345 "\ta.b.c = @a;",
346 "\ta = 5.5;",
347 " a = -20+5;",
348 " a=-20+5;",
349 " a == -b;",
350 "a <= b;",
351 "a > b",
352 "\"string\twith white\\\"Space\"\"unexpected string\""
353 ];
354 LinkedList!CompileError err = new LinkedList!CompileError;
355 Token[] tokens = separateTokens(script, err).tokens;
356 .destroy (err);
357 string[] strTokens;
358 strTokens.length = tokens.length;
359 foreach (i, tok; tokens){
360 strTokens[i] = tok.token;
361 }
362 /*import std.stdio : writeln;
363 foreach(token; strTokens)
364 writeln(token);*/
365 assert (strTokens == [
366 "function", "void", "main", "{",
367 "int", "i", "=", "5", ";",
368 ".", "5sdfdf", "=", "(", "!", "5", "-", "5", ")", ";",
369 "a", ".", "b", ".", "c", "=", "@", "a", ";",
370 "a", "=", "5.5", ";",
371 "a", "=", "-20", "+", "5", ";",
372 "a", "=", "-20", "+", "5", ";",
373 "a", "==", "-", "b", ";",
374 "a", "<=", "b", ";",
375 "a", ">", "b",
376 "\"string\twith white\\\"Space\"", "\"unexpected string\""
377 ]);
378 }
379
380 /// Returns the index of the quotation mark that ends a string
381 ///
382 /// Returns -1 if not found
383 private integer strEnd(string s, uinteger i){
384 const char end = s[i] == '\'' ? '\'' : '\"';
385 for (i++;i<s.length;i++){
386 if (s[i]=='\\'){
387 i++;
388 continue;
389 }else if (s[i]==end){
390 break;
391 }
392 }
393 if (i==s.length){i=-1;}
394 return i;
395 }
396
397 private alias decodeString = strReplaceSpecial;
398
399 /// decodes a string. i.e, converts \t to tab, \" to ", etc
400 /// The string must not be surrounded by quoation marks
401 ///
402 /// Returns: string with special characters replaced with their actual characters (i.e, \t replaced with tab, \n with newline...)
403 private string strReplaceSpecial(char specialCharBegin='\\', char[char] map = ['t' : '\t', 'n' : '\n','\\':'\\'])
404 (string s){
405 char[] r = [];
406 for (uinteger i = 0; i < s.length; i ++){
407 if (s[i] == specialCharBegin && i + 1 < s.length && s[i+1] in map){
408 r ~= map[s[i+1]];
409 i++;
410 continue;
411 }
412 r ~= s[i];
413 }
414 return r;
415 }
416
417 /// Takes script, and separates into tokens (using `separateTokens`), identifies token types, retuns the Tokens with Token.Type
418 /// in an array
419 ///
420 /// `script` is the script to convert to tokens, each line is a separate string, without ending \n
421 /// `errors` is the array to which erors will be put
422 ///
423 /// As a plus, it also checks if the brackets are in correct order (and properly closed)
424 package TokenList toTokens(string[] script, ref CompileError[] errors){
425 LinkedList!CompileError compileErrors = new LinkedList!CompileError;
426 /// Returns true if a string has chars that only identifiers can have
427 TokenList tokens = separateTokens(script, compileErrors);
428 if (tokens.tokens == null || tokens.tokens.length == 0){
429 // there's error
430 errors = compileErrors.toArray;
431 .destroy(compileErrors);
432 return tokens;
433 }else{
434 // continue with identiying tokens
435 // fill in tokens with tokenStrings' strings, and identify their type
436 foreach(i, token; tokens.tokens){
437 try{
438 tokens.tokens[i].type = getTokenType(token.token);
439 }catch(Exception e){
440 compileErrors.append(CompileError(tokens.getTokenLine(i), e.msg));
441 }
442 }
443 // now check brackets
444 tokens.checkBrackets(compileErrors);
445 if (compileErrors.count > 0){
446 errors = compileErrors.toArray;
447 }
448 .destroy(compileErrors);
449 return tokens;
450 }
451 }
452
453
454 /// Checks if the brackets in a tokenlist are in correct order, and are closed
455 ///
456 /// In case not, returns false, and appends error to `errorLog`
457 private bool checkBrackets(TokenList tokens, LinkedList!CompileError errors){
458 enum BracketType{
459 Round,
460 Square,
461 Block
462 }
463 BracketType[Token.Type] brackOpenIdent = [
464 Token.Type.ParanthesesOpen: BracketType.Round,
465 Token.Type.IndexBracketOpen: BracketType.Square,
466 Token.Type.BlockStart: BracketType.Block
467 ];
468 BracketType[Token.Type] brackCloseIdent = [
469 Token.Type.ParanthesesClose: BracketType.Round,
470 Token.Type.IndexBracketClose: BracketType.Square,
471 Token.Type.BlockEnd:BracketType.Block
472 ];
473 Stack!BracketType bracks = new Stack!BracketType;
474 Stack!uinteger bracksStartIndex = new Stack!uinteger;
475 BracketType curType;
476 bool r = true;
477 for (uinteger lastInd = tokens.tokens.length-1, i = 0; i<=lastInd; i++){
478 if (tokens.tokens[i].type in brackOpenIdent){
479 bracks.push(brackOpenIdent[tokens.tokens[i].type]);
480 bracksStartIndex.push(i);
481 }else if (tokens.tokens[i].type in brackCloseIdent){
482 bracksStartIndex.pop();
483 if (bracks.pop != brackCloseIdent[tokens.tokens[i].type]){
484 errors.append(CompileError(tokens.getTokenLine(i),
485 "brackets order is wrong; first opened must be last closed"));
486 r = false;
487 break;
488 }
489 }else if (i == lastInd && bracks.count > 0){
490 // no bracket ending
491 i = -2;
492 errors.append(CompileError(tokens.getTokenLine(bracksStartIndex.pop), "bracket not closed"));
493 r = false;
494 break;
495 }
496 }
497
498 .destroy(bracks);
499 .destroy(bracksStartIndex);
500 return r;
501 }
502
503 /// Returns index of closing/openinig bracket of the provided bracket
504 ///
505 /// `forward` if true, then the search is in forward direction, i.e, the closing bracket is searched for
506 /// `tokens` is the array of tokens to search in
507 /// `index` is the index of the opposite bracket
508 ///
509 /// It only works correctly if the brackets are in correct order, and the closing bracket is present
510 /// so, before calling this, `compiler.misc.checkBrackets` should be called
511 package uinteger tokenBracketPos(bool forward=true)(Token[] tokens, uinteger index){
512 Token.Type[] closingBrackets = [
513 Token.Type.BlockEnd,
514 Token.Type.IndexBracketClose,
515 Token.Type.ParanthesesClose
516 ];
517 Token.Type[] openingBrackets = [
518 Token.Type.BlockStart,
519 Token.Type.IndexBracketOpen,
520 Token.Type.ParanthesesOpen
521 ];
522 uinteger count; // stores how many closing/opening brackets before we reach the desired one
523 uinteger i = index;
524 for (uinteger lastInd = (forward ? tokens.length : 0); i != lastInd; (forward ? i ++: i --)){
525 if ((forward ? openingBrackets : closingBrackets).hasElement(tokens[i].type)){
526 count ++;
527 }else if ((forward ? closingBrackets : openingBrackets).hasElement(tokens[i].type)){
528 count --;
529 }
530 if (count == 0){
531 break;
532 }
533 }
534 return i;
535 }
536 ///
537 unittest{
538 Token[] tokens;
539 tokens = [
540 Token(Token.Type.Comma, ","), Token(Token.Type.BlockStart,"{"), Token(Token.Type.Comma,","),
541 Token(Token.Type.IndexBracketOpen,"["),Token(Token.Type.IndexBracketClose,"]"),Token(Token.Type.BlockEnd,"}")
542 ];
543 assert(tokens.tokenBracketPos!true(1) == 5);
544 assert(tokens.tokenBracketPos!false(5) == 1);
545 assert(tokens.tokenBracketPos!true(3) == 4);
546 assert(tokens.tokenBracketPos!false(4) == 3);
547 }