1 /++ 2 For reading a script into tokens 3 +/ 4 module qscript.compiler.tokengen; 5 6 import utils.misc; 7 import utils.lists; 8 import std.conv:to; 9 import qscript.compiler.compiler; 10 11 /// Each token is stored as a `Token` with the type and the actual token 12 package struct Token{ 13 /// Specifies type of token 14 /// 15 /// used only in `compiler.tokengen` 16 enum Type{ 17 String,/// That the token is: `"SOME STRING"` 18 Char, /// That the token is: `'C'` # Some character 19 Integer,/// That the token an int (or uint, or ubyte) 20 Double, /// That the token is a double (floating point) value 21 Bool, /// a true or false value 22 Identifier,/// That the token is an identifier. i.e token is a variable name or a function name. For a token to be marked as Identifier, it doesn't need to be defined in `new()` 23 DataType, /// the token is a data type 24 MemberSelector, /// a member selector operator 25 AssignmentOperator, /// and assignmentOperator 26 Operator,/// That the token is an operator, like `+`, `==` etc 27 Keyword,/// A `function` or `var` ... 28 Comma,/// That its a comma: `,` 29 StatementEnd,/// A semicolon 30 ParanthesesOpen,/// `(` 31 ParanthesesClose,/// `)` 32 IndexBracketOpen,/// `[` 33 IndexBracketClose,///`]` 34 BlockStart,///`{` 35 BlockEnd,///`}` 36 } 37 Type type;/// type of token 38 string token;/// token 39 this(Type tType, string tToken){ 40 type = tType; 41 token = tToken; 42 } 43 /// Identifies token type itself 44 /// 45 /// Throws: Exception if token invalid 46 this(string tToken){ 47 token = tToken; 48 type = getTokenType(token); 49 } 50 } 51 52 /// To store Tokens with Types where the line number of each token is required 53 package struct TokenList{ 54 Token[] tokens; /// Stores the tokens 55 uinteger[] tokenPerLine; /// Stores the number of tokens in each line 56 /// Returns the line number a token is in by usin the index of the token in `tokens` 57 /// 58 /// The returned value is the line-number, not index, it starts from 1, not zero 59 uinteger getTokenLine(uinteger tokenIndex){ 60 uinteger i = 0, chars = 0; 61 tokenIndex ++; 62 for (; chars < tokenIndex && i < tokenPerLine.length; i ++){ 63 chars += tokenPerLine[i]; 64 } 65 return i; 66 } 67 /// reads tokens into a string 68 static string toString(Token[] t){ 69 char[] r; 70 // set length 71 uinteger length = 0; 72 for (uinteger i = 0; i < t.length; i ++){ 73 length += t[i].token.length; 74 } 75 r.length = length; 76 // read em all into r; 77 for (uinteger i = 0, writeTo = 0; i < t.length; i ++){ 78 r[writeTo .. writeTo + t[i].token.length] = t[i].token; 79 writeTo += t[i].token.length; 80 } 81 return cast(string)r; 82 } 83 } 84 85 /// Attempts to identify a token type by the token (string). 86 /// returns token type, if fails, throws exception 87 private Token.Type getTokenType(string token){ 88 /// Returns true if a string is a keyword 89 bool isKeyword(string s){ 90 return KEYWORDS.hasElement(s); 91 } 92 /// Returns true if a string is an identifier 93 bool isIdentifier(string s){ 94 // token that qualifies as a number can qualify as an identifier, but not vice versa, so this if statement 95 if (!token.isNum && !isKeyword(token)){ 96 return (cast(char[])s).matchElements(cast(char[])IDENT_CHARS); 97 }else{ 98 return false; 99 } 100 } 101 /// Returns true is a string is an operator 102 bool isOperator(string s){ 103 return OPERATORS.hasElement(s) || SOPERATORS.hasElement(s); 104 } 105 /// Returns true if string contains an integer 106 bool isInt(string s){ 107 return isNum(s, false); 108 } 109 /// Returns true if a string contains a double 110 /// 111 /// to be identified as a double, the number must have a decimal point in it 112 bool isDouble(string s){ 113 return isNum(s, true); 114 } 115 if (token == "."){ 116 return Token.Type.MemberSelector; 117 }else if (token == "="){ 118 return Token.Type.AssignmentOperator; 119 }else if (isInt(token)){ 120 return Token.Type.Integer; 121 }else if (isDouble(token)){ 122 return Token.Type.Double; 123 }else if (token == "true" || token == "false"){ 124 return Token.Type.Bool; 125 }else if (DATA_TYPES.hasElement(token)){ 126 return Token.Type.DataType; 127 }else if (isKeyword(token)){ 128 return Token.Type.Keyword; 129 }else if (isIdentifier(token)){ 130 return Token.Type.Identifier; 131 }else if (isOperator(token)){ 132 return Token.Type.Operator; 133 }else if (token[0] == '"'){ 134 return Token.Type.String; 135 }else if (token[0] == '\''){ 136 if (token.length < 3) 137 throw new Exception("no character provided inside ''"); 138 if (decodeString(token[1 .. $-1]).length > 1) 139 throw new Exception("'' can only hold 1 character"); 140 return Token.Type.Char; 141 }else if (token == ";"){ 142 return Token.Type.StatementEnd; 143 }else if (token == ","){ 144 return Token.Type.Comma; 145 }else if (token == "("){ 146 return Token.Type.ParanthesesOpen; 147 }else if (token == ")"){ 148 return Token.Type.ParanthesesClose; 149 }else if (token == "["){ 150 return Token.Type.IndexBracketOpen; 151 }else if (token == "]"){ 152 return Token.Type.IndexBracketClose; 153 }else if (token == "{"){ 154 return Token.Type.BlockStart; 155 }else if (token == "}"){ 156 return Token.Type.BlockEnd; 157 }else{ 158 throw new Exception("unidentified token type '"~token~'\''); 159 } 160 } 161 /// 162 unittest{ 163 assert("thisIsAVar_1234".getTokenType == Token.Type.Identifier); 164 assert("24.5".getTokenType == Token.Type.Double); 165 assert("245".getTokenType == Token.Type.Integer); 166 assert("\"This is a string\"".getTokenType == Token.Type.String); 167 assert("==".getTokenType == Token.Type.Operator); 168 assert(";".getTokenType == Token.Type.StatementEnd); 169 assert(",".getTokenType == Token.Type.Comma); 170 assert("int".getTokenType == Token.Type.DataType); 171 assert("double".getTokenType == Token.Type.DataType); 172 assert("char".getTokenType == Token.Type.DataType); 173 assert("function".getTokenType == Token.Type.Keyword); 174 assert("if".getTokenType == Token.Type.Keyword); 175 assert("while".getTokenType == Token.Type.Keyword); 176 assert("else".getTokenType == Token.Type.Keyword); 177 assert(".".getTokenType == Token.Type.MemberSelector); 178 assert("\'p\'".getTokenType == Token.Type.Char); 179 assert("null".getTokenType == Token.Type.Keyword); 180 assert("true".getTokenType == Token.Type.Bool); 181 assert("false".getTokenType == Token.Type.Bool); 182 } 183 184 /// returns Token[] with type identified based on string[] input 185 private Token[] stringToTokens(string[] s, bool detectType = true){ 186 Token[] r; 187 r.length = s.length; 188 foreach (i, token; s){ 189 if (detectType) 190 r[i].type = getTokenType(s[i]); 191 r[i].token = s[i].dup; 192 } 193 return r; 194 } 195 196 /// Reads script, and separates tokens 197 private TokenList separateTokens(string[] script, LinkedList!CompileError compileErrors){ 198 static bool isDifferent(char c, ref char[] token){ 199 static const SEPERATORS = ['(','[','{','}',']',')', ';', ',']; 200 static const WHITESPACE = [' ', '\t']; 201 static char[] lastToken = []; /// stores last complete token, used to check if `-` or `.` is to be considered operator or part of number 202 static char pendingTokenChar = 0; /// as the name says... 203 if (pendingTokenChar != 0){ 204 token = [pendingTokenChar]; 205 pendingTokenChar = 0; 206 if (SEPERATORS.hasElement(token[0])){ 207 if (!WHITESPACE.hasElement(c)) 208 pendingTokenChar = c; 209 lastToken = token.dup; 210 return true; 211 } 212 } 213 if (token.length && ['"', '\''].hasElement(token[0])){ 214 token = token ~ c; 215 if (c == token[0] && token[$-2] != '\\'){ 216 lastToken = token.dup; 217 return true; 218 } 219 return false; 220 } 221 if (WHITESPACE.hasElement(c)){ 222 if (token.length > 0){ 223 lastToken = token.dup; 224 return true; 225 } 226 return false; 227 } 228 if (SEPERATORS.hasElement(c)){ 229 if (token.length == 0){ 230 token = [c]; 231 lastToken = token.dup; 232 return true; 233 } 234 pendingTokenChar = c; 235 lastToken = token.dup; 236 return true; 237 } 238 if (token.length > 0){ 239 // strings 240 if (token[0] == '"' || token[0] == '\''){ 241 token = token ~ c; 242 if (c == token[0] && token[$-1] != '\\'){ 243 lastToken = token.dup; 244 return true; 245 } 246 return false; 247 } 248 // unexpected strings get read as separate tokens 249 if ((c == '"' || c == '\'') && token[0] != c){ 250 pendingTokenChar = c; 251 lastToken = token.dup; 252 return true; 253 } 254 // space 255 if (c == ' ' || c == '\t'){ 256 lastToken = token.dup; 257 return true; 258 } 259 // - is operator or part of number 260 if (token[0] == '-' && isNum([c],false) && !(lastToken.matchElements(cast(char[])IDENT_CHARS))){ 261 token = token ~ c; 262 // go on 263 return false; 264 } 265 // . is memberSelector or decimal place 266 if (c == '.' && !isNum(cast(string)token, false)){ 267 lastToken = token; 268 pendingTokenChar = c; 269 return true; 270 } 271 // token is operator 272 if (OPERATORS.hasElement(cast(string)token) || SOPERATORS.hasElement(cast(string)token)){ 273 // see if it's still operator after adding c 274 if (OPERATORS.hasElement(cast(string)(token ~ c)) || SOPERATORS.hasElement(cast(string)(token ~ c))){ 275 // go on 276 token = token ~ c; 277 return false; 278 }else{ 279 pendingTokenChar = c; 280 lastToken = token.dup; 281 return true; 282 } 283 }else if ((OPERATORS.hasElement(cast(string)[c]) || SOPERATORS.hasElement(cast(string)[c])) && !isNum(cast(string)(token~c))){ 284 // token not operator, c is operator 285 pendingTokenChar = c; 286 lastToken = token.dup; 287 return true; 288 } 289 } 290 // nothing else matches, just add it to end 291 token = token ~ c; 292 return false; 293 } 294 LinkedList!string tokens = new LinkedList!string; 295 uinteger[] tokenPerLine; 296 tokenPerLine.length = script.length; 297 uinteger tokenCount = 0; 298 foreach (lineno, line; script){ 299 integer stringEndIndex = -1; 300 char[] token = []; 301 for (uinteger i = 0; i < line.length; i ++){ 302 // skip strings 303 if ((line[i] == '"' || line[i] == '\'') && i > stringEndIndex){ 304 stringEndIndex = line.strEnd(i); 305 if (stringEndIndex == -1){ 306 compileErrors.append(CompileError(lineno, "string not closed")); 307 break; 308 } 309 } 310 // break at comments 311 if (line[i] == '#' && cast(integer)i > stringEndIndex){ 312 isDifferent(' ', token); 313 // add pending token 314 if (token.length){ 315 tokens.append(cast(string)token.dup); 316 token = []; 317 } 318 break; 319 } 320 // hand this line[i] to isDifferent 321 if (isDifferent(line[i], token)){ 322 tokens.append(cast(string)token.dup); 323 token = []; 324 } 325 } 326 isDifferent(' ', token); 327 if (token.length) 328 tokens.append(cast(string)token.dup); 329 tokenPerLine[lineno] = tokens.count - tokenCount; 330 tokenCount += tokenPerLine[lineno]; 331 } 332 // put them all in TokenList 333 TokenList r; 334 r.tokenPerLine = tokenPerLine; // no need to dup it 335 r.tokens = stringToTokens(tokens.toArray, false); 336 .destroy (tokens); 337 return r; 338 } 339 /// 340 unittest{ 341 string[] script = [ 342 "function void main{", 343 "\tint i = 5;", 344 "\t.5sdfdf = (!5 - 5);", 345 "\ta.b.c = @a;", 346 "\ta = 5.5;", 347 " a = -20+5;", 348 " a=-20+5;", 349 " a == -b;", 350 "a <= b;", 351 "a > b", 352 "\"string\twith white\\\"Space\"\"unexpected string\"" 353 ]; 354 LinkedList!CompileError err = new LinkedList!CompileError; 355 Token[] tokens = separateTokens(script, err).tokens; 356 .destroy (err); 357 string[] strTokens; 358 strTokens.length = tokens.length; 359 foreach (i, tok; tokens){ 360 strTokens[i] = tok.token; 361 } 362 /*import std.stdio : writeln; 363 foreach(token; strTokens) 364 writeln(token);*/ 365 assert (strTokens == [ 366 "function", "void", "main", "{", 367 "int", "i", "=", "5", ";", 368 ".", "5sdfdf", "=", "(", "!", "5", "-", "5", ")", ";", 369 "a", ".", "b", ".", "c", "=", "@", "a", ";", 370 "a", "=", "5.5", ";", 371 "a", "=", "-20", "+", "5", ";", 372 "a", "=", "-20", "+", "5", ";", 373 "a", "==", "-", "b", ";", 374 "a", "<=", "b", ";", 375 "a", ">", "b", 376 "\"string\twith white\\\"Space\"", "\"unexpected string\"" 377 ]); 378 } 379 380 /// Returns the index of the quotation mark that ends a string 381 /// 382 /// Returns -1 if not found 383 private integer strEnd(string s, uinteger i){ 384 const char end = s[i] == '\'' ? '\'' : '\"'; 385 for (i++;i<s.length;i++){ 386 if (s[i]=='\\'){ 387 i++; 388 continue; 389 }else if (s[i]==end){ 390 break; 391 } 392 } 393 if (i==s.length){i=-1;} 394 return i; 395 } 396 397 private alias decodeString = strReplaceSpecial; 398 399 /// decodes a string. i.e, converts \t to tab, \" to ", etc 400 /// The string must not be surrounded by quoation marks 401 /// 402 /// Returns: string with special characters replaced with their actual characters (i.e, \t replaced with tab, \n with newline...) 403 private string strReplaceSpecial(char specialCharBegin='\\', char[char] map = ['t' : '\t', 'n' : '\n','\\':'\\']) 404 (string s){ 405 char[] r = []; 406 for (uinteger i = 0; i < s.length; i ++){ 407 if (s[i] == specialCharBegin && i + 1 < s.length && s[i+1] in map){ 408 r ~= map[s[i+1]]; 409 i++; 410 continue; 411 } 412 r ~= s[i]; 413 } 414 return r; 415 } 416 417 /// Takes script, and separates into tokens (using `separateTokens`), identifies token types, retuns the Tokens with Token.Type 418 /// in an array 419 /// 420 /// `script` is the script to convert to tokens, each line is a separate string, without ending \n 421 /// `errors` is the array to which erors will be put 422 /// 423 /// As a plus, it also checks if the brackets are in correct order (and properly closed) 424 package TokenList toTokens(string[] script, ref CompileError[] errors){ 425 LinkedList!CompileError compileErrors = new LinkedList!CompileError; 426 /// Returns true if a string has chars that only identifiers can have 427 TokenList tokens = separateTokens(script, compileErrors); 428 if (tokens.tokens == null || tokens.tokens.length == 0){ 429 // there's error 430 errors = compileErrors.toArray; 431 .destroy(compileErrors); 432 return tokens; 433 }else{ 434 // continue with identiying tokens 435 // fill in tokens with tokenStrings' strings, and identify their type 436 foreach(i, token; tokens.tokens){ 437 try{ 438 tokens.tokens[i].type = getTokenType(token.token); 439 }catch(Exception e){ 440 compileErrors.append(CompileError(tokens.getTokenLine(i), e.msg)); 441 } 442 } 443 // now check brackets 444 tokens.checkBrackets(compileErrors); 445 if (compileErrors.count > 0){ 446 errors = compileErrors.toArray; 447 } 448 .destroy(compileErrors); 449 return tokens; 450 } 451 } 452 453 454 /// Checks if the brackets in a tokenlist are in correct order, and are closed 455 /// 456 /// In case not, returns false, and appends error to `errorLog` 457 private bool checkBrackets(TokenList tokens, LinkedList!CompileError errors){ 458 enum BracketType{ 459 Round, 460 Square, 461 Block 462 } 463 BracketType[Token.Type] brackOpenIdent = [ 464 Token.Type.ParanthesesOpen: BracketType.Round, 465 Token.Type.IndexBracketOpen: BracketType.Square, 466 Token.Type.BlockStart: BracketType.Block 467 ]; 468 BracketType[Token.Type] brackCloseIdent = [ 469 Token.Type.ParanthesesClose: BracketType.Round, 470 Token.Type.IndexBracketClose: BracketType.Square, 471 Token.Type.BlockEnd:BracketType.Block 472 ]; 473 Stack!BracketType bracks = new Stack!BracketType; 474 Stack!uinteger bracksStartIndex = new Stack!uinteger; 475 BracketType curType; 476 bool r = true; 477 for (uinteger lastInd = tokens.tokens.length-1, i = 0; i<=lastInd; i++){ 478 if (tokens.tokens[i].type in brackOpenIdent){ 479 bracks.push(brackOpenIdent[tokens.tokens[i].type]); 480 bracksStartIndex.push(i); 481 }else if (tokens.tokens[i].type in brackCloseIdent){ 482 bracksStartIndex.pop(); 483 if (bracks.pop != brackCloseIdent[tokens.tokens[i].type]){ 484 errors.append(CompileError(tokens.getTokenLine(i), 485 "brackets order is wrong; first opened must be last closed")); 486 r = false; 487 break; 488 } 489 }else if (i == lastInd && bracks.count > 0){ 490 // no bracket ending 491 i = -2; 492 errors.append(CompileError(tokens.getTokenLine(bracksStartIndex.pop), "bracket not closed")); 493 r = false; 494 break; 495 } 496 } 497 498 .destroy(bracks); 499 .destroy(bracksStartIndex); 500 return r; 501 } 502 503 /// Returns index of closing/openinig bracket of the provided bracket 504 /// 505 /// `forward` if true, then the search is in forward direction, i.e, the closing bracket is searched for 506 /// `tokens` is the array of tokens to search in 507 /// `index` is the index of the opposite bracket 508 /// 509 /// It only works correctly if the brackets are in correct order, and the closing bracket is present 510 /// so, before calling this, `compiler.misc.checkBrackets` should be called 511 package uinteger tokenBracketPos(bool forward=true)(Token[] tokens, uinteger index){ 512 Token.Type[] closingBrackets = [ 513 Token.Type.BlockEnd, 514 Token.Type.IndexBracketClose, 515 Token.Type.ParanthesesClose 516 ]; 517 Token.Type[] openingBrackets = [ 518 Token.Type.BlockStart, 519 Token.Type.IndexBracketOpen, 520 Token.Type.ParanthesesOpen 521 ]; 522 uinteger count; // stores how many closing/opening brackets before we reach the desired one 523 uinteger i = index; 524 for (uinteger lastInd = (forward ? tokens.length : 0); i != lastInd; (forward ? i ++: i --)){ 525 if ((forward ? openingBrackets : closingBrackets).hasElement(tokens[i].type)){ 526 count ++; 527 }else if ((forward ? closingBrackets : openingBrackets).hasElement(tokens[i].type)){ 528 count --; 529 } 530 if (count == 0){ 531 break; 532 } 533 } 534 return i; 535 } 536 /// 537 unittest{ 538 Token[] tokens; 539 tokens = [ 540 Token(Token.Type.Comma, ","), Token(Token.Type.BlockStart,"{"), Token(Token.Type.Comma,","), 541 Token(Token.Type.IndexBracketOpen,"["),Token(Token.Type.IndexBracketClose,"]"),Token(Token.Type.BlockEnd,"}") 542 ]; 543 assert(tokens.tokenBracketPos!true(1) == 5); 544 assert(tokens.tokenBracketPos!false(5) == 1); 545 assert(tokens.tokenBracketPos!true(3) == 4); 546 assert(tokens.tokenBracketPos!false(4) == 3); 547 }