/++
For reading a script into tokens
+/
module qscript.compiler.tokengen;

import utils.misc;
import utils.lists;
import std.conv:to;
import qscript.compiler.misc;

/// stores errors for tokengen
private LinkedList!CompileError compileErrors;

/// Attempts to identify a token type by the token (string).
///
/// The checks run in a fixed order: `.` and `=` first, then integer, double, data type, keyword, identifier,
/// operator, string literal, char literal, and finally the single-character punctuation tokens.
///
/// Returns: the type of the token
///
/// Throws: Exception if the token matches none of the known types, or if it is a malformed char literal
private Token.Type getTokenType(string token){
	/// Returns true if a string is a keyword
	bool isKeyword(string s){
		return KEYWORDS.hasElement(s);
	}
	/// Returns true if a string is an identifier
	bool isIdentifier(string s){
		// a token that qualifies as a number can qualify as an identifier, but not vice versa, so numbers (and
		// keywords) are excluded before the characters are checked
		if (s.isNum || isKeyword(s))
			return false;
		return (cast(char[])s).matchElements(cast(char[])IDENT_CHARS);
	}
	/// Returns true is a string is an operator
	bool isOperator(string s){
		return OPERATORS.hasElement(s) || SOPERATORS.hasElement(s);
	}
	/// Returns true if string contains an integer
	bool isInt(string s){
		return isNum(s, false);
	}
	/// Returns true if a string contains a double
	///
	/// to be identified as a double, the number must have a decimal point in it
	bool isDouble(string s){
		return isNum(s, true);
	}
	if (token == "."){
		return Token.Type.MemberSelector;
	}else if (token == "="){
		return Token.Type.AssignmentOperator;
	}else if (isInt(token)){
		return Token.Type.Integer;
	}else if (isDouble(token)){
		return Token.Type.Double;
	}else if (DATA_TYPES.hasElement(token)){
		return Token.Type.DataType;
	}else if (isKeyword(token)){
		return Token.Type.Keyword;
	}else if (isIdentifier(token)){
		return Token.Type.Identifier;
	}else if (isOperator(token)){
		return Token.Type.Operator;
	}else if (token.length > 0 && token[0] == '"'){
		// length is checked first so an empty token ends in the Exception below rather than a range violation,
		// which the `catch (Exception)` in toTokens would not catch
		return Token.Type.String;
	}else if (token.length > 0 && token[0] == '\''){
		if (token.length < 3)
			throw new Exception("no character provided inside ''");
		if (decodeString(token[1 .. $-1]).length > 1)
			throw new Exception("'' can only hold 1 character");
		return Token.Type.Char;
	}else if (token == ";"){
		return Token.Type.StatementEnd;
	}else if (token == ","){
		return Token.Type.Comma;
	}else if (token == "("){
		return Token.Type.ParanthesesOpen;
	}else if (token == ")"){
		return Token.Type.ParanthesesClose;
	}else if (token == "["){
		return Token.Type.IndexBracketOpen;
	}else if (token == "]"){
		return Token.Type.IndexBracketClose;
	}else if (token == "{"){
		return Token.Type.BlockStart;
	}else if (token == "}"){
		return Token.Type.BlockEnd;
	}else{
		throw new Exception("unidentified token type '"~token~'\'');
	}
}
///
unittest{
	assert("thisIsAVar_1234".getTokenType == Token.Type.Identifier);
	assert("24.5".getTokenType == Token.Type.Double);
	assert("245".getTokenType == Token.Type.Integer);
	assert("\"This is a string\"".getTokenType == Token.Type.String);
	assert("==".getTokenType == Token.Type.Operator);
	assert(";".getTokenType == Token.Type.StatementEnd);
	assert(",".getTokenType == Token.Type.Comma);
	assert("int".getTokenType == Token.Type.DataType);
	assert("double".getTokenType == Token.Type.DataType);
	assert("char".getTokenType == Token.Type.DataType);
	assert("function".getTokenType == Token.Type.Keyword);
	assert("if".getTokenType == Token.Type.Keyword);
	assert("while".getTokenType == Token.Type.Keyword);
	assert("else".getTokenType == Token.Type.Keyword);
	assert(".".getTokenType == Token.Type.MemberSelector);
	assert("\'p\'".getTokenType == Token.Type.Char);
}

/// Wraps each string of `s` in a Token.
///
/// Only the `token` member is set here; the `type` member is filled in afterwards by `toTokens`.
///
/// Returns: Token[] of the same length and order as `s`
package Token[] stringToTokens(string[] s){
	Token[] r;
	r.length = s.length;
	foreach (i, str; s){
		r[i].token = str.dup;
	}
	return r;
}

/// Reads script, and separates tokens
///
/// `script` is the script, one string per line. Everything after a `#` that is outside a string/char literal is
/// ignored as a comment. A literal that is opened but not closed on the same line appends a "string not closed"
/// error to `compileErrors` and the rest of that line is skipped.
///
/// Returns: TokenList holding the tokens (types not yet identified) and the number of tokens on each line
private TokenList separateTokens(string[] script){
	// State shared between successive isDifferent calls. These are locals (not statics) so that nothing leaks
	// from one separateTokens call into the next.
	char[] lastToken = []; /// stores last complete token, used to check if `-` or `.` is to be considered operator or part of number
	char pendingTokenChar = 0; /// character that ended the previous token and must start the next one, 0 if none

	/// Returns: true if the last character of `s` is preceded by an odd number of backslashes, i.e. is escaped
	static bool lastCharEscaped(const char[] s){
		uinteger backslashes = 0;
		for (uinteger i = s.length; i > 1 && s[i - 2] == '\\'; i --)
			backslashes ++;
		return backslashes % 2 == 1;
	}

	/// Returns: index of the quote that closes the literal opened at `line[start]`, or -1 if it is never closed
	///
	/// The closing quote must be the same character as the opening one; a character following a backslash is
	/// skipped, which is the same escaping rule `isDifferent` applies.
	static integer quoteEnd(string line, uinteger start){
		immutable char quote = line[start];
		for (uinteger i = start + 1; i < line.length; i ++){
			if (line[i] == '\\'){
				i ++; // skip the escaped character as well
				continue;
			}
			if (line[i] == quote)
				return cast(integer)i;
		}
		return -1;
	}

	/// Feeds one character `c` to the token being built in `token`.
	///
	/// Returns: true if `token` now holds a complete token (the caller stores it and must pass an empty `token`
	/// on the next call), false if more characters are needed
	bool isDifferent(char c, ref char[] token){
		static const SEPERATORS = ['(','[','{','}',']',')', ';', ','];
		static const WHITESPACE = [' ', '\t'];
		// a character held back by the previous call starts the new token
		if (pendingTokenChar != 0){
			token = [pendingTokenChar];
			pendingTokenChar = 0;
			// separators are always a token of their own, so `c` is held back instead (unless it is whitespace)
			if (SEPERATORS.hasElement(token[0])){
				if (!WHITESPACE.hasElement(c))
					pendingTokenChar = c;
				lastToken = token.dup;
				return true;
			}
		}
		// inside a string/char literal: take everything up to the matching, unescaped, closing quote
		if (token.length && ['"', '\''].hasElement(token[0])){
			token = token ~ c;
			if (c == token[0] && !lastCharEscaped(token)){
				lastToken = token.dup;
				return true;
			}
			return false;
		}
		// whitespace ends the current token, if there is one
		if (WHITESPACE.hasElement(c)){
			if (token.length > 0){
				lastToken = token.dup;
				return true;
			}
			return false;
		}
		// separators end the current token, and are tokens themselves
		if (SEPERATORS.hasElement(c)){
			if (token.length == 0){
				token = [c];
				lastToken = token.dup;
				return true;
			}
			pendingTokenChar = c;
			lastToken = token.dup;
			return true;
		}
		if (token.length > 0){
			// a quote directly after another token starts a separate (string/char) token
			if (c == '\"' || c == '\''){
				pendingTokenChar = c;
				lastToken = token.dup;
				return true;
			}
			// - is operator or part of number
			if (token[0] == '-' && isNum([c],false) && !(lastToken.matchElements(cast(char[])IDENT_CHARS))){
				token = token ~ c;
				// go on
				return false;
			}
			// . is memberSelector or decimal place
			if (c == '.' && !isNum(cast(string)token, false)){
				lastToken = token.dup;
				pendingTokenChar = c;
				return true;
			}
			// token is operator
			if (OPERATORS.hasElement(cast(string)token) || SOPERATORS.hasElement(cast(string)token)){
				// see if it's still operator after adding c
				if (OPERATORS.hasElement(cast(string)(token ~ c)) || SOPERATORS.hasElement(cast(string)(token ~ c))){
					// go on
					token = token ~ c;
					return false;
				}else{
					pendingTokenChar = c;
					lastToken = token.dup;
					return true;
				}
			}else if ((OPERATORS.hasElement(cast(string)[c]) || SOPERATORS.hasElement(cast(string)[c])) && !isNum(cast(string)(token~c))){
				// token not operator, c is operator
				pendingTokenChar = c;
				lastToken = token.dup;
				return true;
			}
		}
		// nothing else matches, just add it to end
		token = token ~ c;
		return false;
	}

	LinkedList!string tokens = new LinkedList!string;
	uinteger[] tokenPerLine;
	tokenPerLine.length = script.length;
	uinteger tokenCount = 0;
	foreach (lineno, line; script){
		integer stringEndIndex = -1; // index of the closing quote of the most recent literal on this line
		char[] token = [];
		for (uinteger i = 0; i < line.length; i ++){
			// A quote outside of a literal opens a new one; remember where it ends.
			// `i` is cast because comparing the unsigned index against a signed -1 would convert the -1 to the
			// largest unsigned value and make the test always false.
			if ((line[i] == '"' || line[i] == '\'') && cast(integer)i > stringEndIndex){
				stringEndIndex = quoteEnd(line, i);
				if (stringEndIndex == -1){
					compileErrors.append(CompileError(lineno, "string not closed"));
					break;
				}
			}
			// break at comments; the token in progress is flushed after the loop
			if (line[i] == '#' && cast(integer)i > stringEndIndex)
				break;
			// hand this line[i] to isDifferent
			if (isDifferent(line[i], token)){
				tokens.append(cast(string)token.dup);
				token = [];
			}
		}
		// end of line acts like whitespace: flush the token in progress and any held back character
		isDifferent(' ', token);
		if (token.length)
			tokens.append(cast(string)token.dup);
		tokenPerLine[lineno] = tokens.count - tokenCount;
		tokenCount += tokenPerLine[lineno];
	}
	// put them all in TokenList
	TokenList r;
	r.tokenPerLine = tokenPerLine; // no need to dup it
	r.tokens = stringToTokens(tokens.toArray);
	.destroy (tokens);
	return r;
}
///
unittest{
	string[] script = [
		"function void main{",
		"\tint i = 5;",
		"\t.5sdfdf = (!5 - 5);",
		"\ta.b.c = @a;",
		"\ta = 5.5;",
		" a = -20+5;",
		" a=-20+5;",
		" a == -b;",
		"a <= b;",
		"a > b",
	];
	Token[] tokens = separateTokens(script).tokens;
	string[] strTokens;
	strTokens.length = tokens.length;
	foreach (i, tok; tokens){
		strTokens[i] = tok.token;
	}
	/*import std.stdio : writeln;
	foreach(token; strTokens)
		writeln(token);*/
	assert (strTokens == [
			"function", "void", "main", "{",
			"int", "i", "=", "5", ";",
			".", "5sdfdf", "=", "(", "!", "5", "-", "5", ")", ";",
			"a", ".", "b", ".", "c", "=", "@", "a", ";",
			"a", "=", "5.5", ";",
			"a", "=", "-20", "+", "5", ";",
			"a", "=", "-20", "+", "5", ";",
			"a", "==", "-", "b", ";",
			"a", "<=", "b", ";",
			"a", ">", "b"
		]);
}
/// `#` and escaped quotes inside a string literal do not end the string
unittest{
	Token[] tokens = separateTokens(["s = \"a#b\\\"c\"; # trailing comment"]).tokens;
	string[] strTokens;
	strTokens.length = tokens.length;
	foreach (i, tok; tokens){
		strTokens[i] = tok.token;
	}
	assert (strTokens == ["s", "=", "\"a#b\\\"c\"", ";"]);
}
/// Takes script, and separates into tokens (using `separateTokens`), identifies token types, returns the Tokens with Token.Type
/// in an array
///
/// `script` is the script to convert to tokens, each line is a separate string, without ending \n
/// `errors` is the array to which errors will be put. If no token was found it is always overwritten, otherwise
/// it is only overwritten if at least one error occurred
///
/// As a plus, it also checks if the brackets are in correct order (and properly closed)
package TokenList toTokens(string[] script, ref CompileError[] errors){
	compileErrors = new LinkedList!CompileError;
	TokenList tokens = separateTokens(script);
	if (tokens.tokens == null || tokens.tokens.length == 0){
		// no tokens to identify, just hand over whatever errors were collected
		errors = compileErrors.toArray;
		.destroy(compileErrors);
		return tokens;
	}
	// continue with identifying tokens; a token that cannot be identified is reported as an error on its line
	foreach(i, token; tokens.tokens){
		try{
			tokens.tokens[i].type = getTokenType(token.token);
		}catch(Exception e){
			compileErrors.append(CompileError(tokens.getTokenLine(i), e.msg));
		}
	}
	// now check brackets
	tokens.checkBrackets(compileErrors);
	if (compileErrors.count > 0){
		errors = compileErrors.toArray;
	}
	.destroy(compileErrors);
	return tokens;
}