/++
For reading a script into tokens
+/
module qscript.compiler.tokengen;

import utils.misc;
import utils.lists;
import std.conv:to;
import qscript.compiler.misc;

/// stores errors for tokengen
private LinkedList!CompileError compileErrors;

/// Attempts to identify a token type by the token (string).
///
/// Checks are made in a fixed order (member selector and assignment first, then numbers, data types,
/// keywords, identifiers, operators, string/char literals, and finally punctuation), the first match wins.
///
/// Returns: the token type
///
/// Throws: Exception if the token matches none of the known types, or if a char literal is empty or holds
/// more than one character
private Token.Type getTokenType(string token){
	/// Returns true if a string is a keyword
	bool isKeyword(string s){
		return KEYWORDS.hasElement(s);
	}
	/// Returns true if a string is an identifier
	bool isIdentifier(string s){
		// a token that qualifies as a number also consists of identifier chars only, so numbers (and
		// keywords) have to be ruled out before looking at the characters
		if (s.isNum || isKeyword(s))
			return false;
		return (cast(char[])s).matchElements(cast(char[])IDENT_CHARS);
	}
	/// Returns true if a string is an operator
	bool isOperator(string s){
		return OPERATORS.hasElement(s) || SOPERATORS.hasElement(s);
	}
	/// Returns true if string contains an integer
	bool isInt(string s){
		return isNum(s, false);
	}
	/// Returns true if a string contains a double
	///
	/// to be identified as a double, the number must have a decimal point in it
	bool isDouble(string s){
		return isNum(s, true);
	}

	// these two single-char tokens are recognised before anything else
	switch (token){
		case ".":
			return Token.Type.MemberSelector;
		case "=":
			return Token.Type.AssignmentOperator;
		default:
			break;
	}
	if (isInt(token))
		return Token.Type.Integer;
	if (isDouble(token))
		return Token.Type.Double;
	if (DATA_TYPES.hasElement(token))
		return Token.Type.DataType;
	if (isKeyword(token))
		return Token.Type.Keyword;
	if (isIdentifier(token))
		return Token.Type.Identifier;
	if (isOperator(token))
		return Token.Type.Operator;
	// length is checked so that an empty token ends up in the "unidentified" exception below instead of
	// indexing out of bounds
	if (token.length > 0 && token[0] == '"')
		return Token.Type.String;
	if (token.length > 0 && token[0] == '\''){
		if (token.length < 3)
			throw new Exception("no character provided inside ''");
		if (decodeString(token[1 .. $-1]).length > 1)
			throw new Exception("'' can only hold 1 character");
		return Token.Type.Char;
	}
	switch (token){
		case ";":
			return Token.Type.StatementEnd;
		case ",":
			return Token.Type.Comma;
		case "(":
			return Token.Type.ParanthesesOpen;
		case ")":
			return Token.Type.ParanthesesClose;
		case "[":
			return Token.Type.IndexBracketOpen;
		case "]":
			return Token.Type.IndexBracketClose;
		case "{":
			return Token.Type.BlockStart;
		case "}":
			return Token.Type.BlockEnd;
		default:
			throw new Exception("unidentified token type '"~token~'\'');
	}
}
///
unittest{
	assert("thisIsAVar_1234".getTokenType == Token.Type.Identifier);
	assert("24.5".getTokenType == Token.Type.Double);
	assert("245".getTokenType == Token.Type.Integer);
	assert("\"This is a string\"".getTokenType == Token.Type.String);
	assert("==".getTokenType == Token.Type.Operator);
	assert(";".getTokenType == Token.Type.StatementEnd);
	assert(",".getTokenType == Token.Type.Comma);
	assert("int".getTokenType == Token.Type.DataType);
	assert("double".getTokenType == Token.Type.DataType);
	assert("char".getTokenType == Token.Type.DataType);
	assert("function".getTokenType == Token.Type.Keyword);
	assert("if".getTokenType == Token.Type.Keyword);
	assert("while".getTokenType == Token.Type.Keyword);
	assert("else".getTokenType == Token.Type.Keyword);
	assert(".".getTokenType == Token.Type.MemberSelector);
	assert("\'p\'".getTokenType == Token.Type.Char);
}

/// Returns: Token[] with type identified based on string[] input
///
/// Throws: Exception (from `getTokenType`) if the type of any token cannot be identified
package Token[] stringToTokens(string[] s){
	Token[] r;
	r.length = s.length;
	foreach (i, str; s){
		r[i].type = getTokenType(str);
		r[i].token = str.dup;
	}
	return r;
}

/// Reads script, and separates tokens
///
/// String and char literals are taken out of the line as a whole, so whitespace, separators, operators and
/// `#` inside them do not split them. Everything after a `#` outside a literal is ignored.
/// A literal that is not closed on its line is reported to `compileErrors`, and the rest of that line is
/// skipped.
///
/// The returned tokens only have their `token` string set. Identifying types is left to the caller
/// (`toTokens`), which records tokens that cannot be identified as compile errors instead of letting the
/// exception escape.
///
/// Returns: TokenList with the token strings, and number of tokens on each line
private TokenList separateTokens(string[] script){
	/// stores last complete token, used to check if `-` is to be considered operator or part of number.
	/// Kept across lines, but local to this call, so separate scripts do not affect each other
	char[] lastToken = [];
	/// a char that ended the previous token, and still has to be started as a new token
	char pendingTokenChar = 0;

	/// Finds where the literal that starts at `line[start]` ends. A backslash escapes the char after it.
	///
	/// NOTE(review): this is done locally rather than with `strEnd`, because how `strEnd` treats
	/// `'`-delimited literals cannot be told from this file.
	///
	/// Returns: index of the closing quote, or -1 if the literal is not closed
	static integer literalEnd(string line, uinteger start){
		immutable char quote = line[start];
		for (uinteger i = start + 1; i < line.length; i ++){
			if (line[i] == '\\'){
				// skip the escaped char
				i ++;
				continue;
			}
			if (line[i] == quote)
				return cast(integer)i;
		}
		return -1;
	}

	/// Feeds next char `c` to the token being built in `token`. Must never be given a quote char, literals
	/// are dealt with by the caller.
	///
	/// Returns: true if `token` is now a complete token (`c` is then either dropped, if whitespace, or kept
	/// in `pendingTokenChar`), false if `c` was added to `token`, or ignored
	bool isDifferent(char c, ref char[] token){
		static const SEPERATORS = ['(','[','{','}',']',')', ';', ','];
		static const WHITESPACE = [' ', '\t'];
		if (pendingTokenChar != 0){
			token = [pendingTokenChar];
			pendingTokenChar = 0;
			// separators are always a token on their own
			if (SEPERATORS.hasElement(token[0])){
				if (!WHITESPACE.hasElement(c))
					pendingTokenChar = c;
				lastToken = token.dup;
				return true;
			}
		}
		if (WHITESPACE.hasElement(c)){
			if (token.length == 0)
				return false;
			lastToken = token.dup;
			return true;
		}
		if (SEPERATORS.hasElement(c)){
			if (token.length == 0)
				token = [c];
			else
				pendingTokenChar = c;
			lastToken = token.dup;
			return true;
		}
		if (token.length > 0){
			// - is operator or part of number
			if (token[0] == '-' && isNum([c],false) && !(lastToken.matchElements(cast(char[])IDENT_CHARS))){
				token = token ~ c;
				return false;
			}
			// . is memberSelector or decimal place
			if (c == '.' && !isNum(cast(string)token, false)){
				lastToken = token.dup;
				pendingTokenChar = c;
				return true;
			}
			immutable bool tokenIsOperator =
				OPERATORS.hasElement(cast(string)token) || SOPERATORS.hasElement(cast(string)token);
			if (tokenIsOperator){
				// see if it's still operator after adding c
				if (OPERATORS.hasElement(cast(string)(token ~ c)) || SOPERATORS.hasElement(cast(string)(token ~ c))){
					token = token ~ c;
					return false;
				}
				pendingTokenChar = c;
				lastToken = token.dup;
				return true;
			}
			if ((OPERATORS.hasElement(cast(string)[c]) || SOPERATORS.hasElement(cast(string)[c])) &&
				!isNum(cast(string)(token~c))){
				// token not operator, c is operator
				pendingTokenChar = c;
				lastToken = token.dup;
				return true;
			}
		}
		// nothing else matches, just add it to end
		token = token ~ c;
		return false;
	}

	LinkedList!string tokens = new LinkedList!string;
	uinteger[] tokenPerLine;
	tokenPerLine.length = script.length;
	uinteger tokenCount = 0;
	foreach (lineno, line; script){
		char[] token = [];
		for (uinteger i = 0; i < line.length; i ++){
			immutable char c = line[i];
			// string & char literals
			if (c == '"' || c == '\''){
				// whatever was being read ends here
				isDifferent(' ', token);
				if (token.length){
					tokens.append(cast(string)token.dup);
					token = [];
				}
				immutable integer end = literalEnd(line, i);
				if (end == -1){
					compileErrors.append(CompileError(lineno, "string not closed"));
					break;
				}
				string literal = line[i .. end + 1];
				tokens.append(literal);
				lastToken = literal.dup;
				// continue reading after the closing quote
				i = cast(uinteger)end;
				continue;
			}
			// break at comments, the pending token is added after the loop
			if (c == '#')
				break;
			// hand this char to isDifferent
			if (isDifferent(c, token)){
				tokens.append(cast(string)token.dup);
				token = [];
			}
		}
		// line end also ends the token
		isDifferent(' ', token);
		if (token.length)
			tokens.append(cast(string)token.dup);
		tokenPerLine[lineno] = tokens.count - tokenCount;
		tokenCount += tokenPerLine[lineno];
	}
	// put them all in TokenList
	TokenList r;
	r.tokenPerLine = tokenPerLine; // no need to dup it
	string[] tokenStrings = tokens.toArray;
	r.tokens.length = tokenStrings.length;
	foreach (i, str; tokenStrings){
		r.tokens[i].token = str;
	}
	.destroy (tokens);
	return r;
}
///
unittest{
	string[] script = [
		"function void main{",
		"\tint i = 5;",
		"\t.5sdfdf = (!5 - 5);",
		"\ta.b.c = @a;",
		"\ta = 5.5;",
		" a = -20+5;",
		" a=-20+5;",
		" a == -b;",
		"a <= b;",
		"a > b",
	];
	Token[] tokens = separateTokens(script).tokens;
	string[] strTokens;
	strTokens.length = tokens.length;
	foreach (i, tok; tokens){
		strTokens[i] = tok.token;
	}
	assert (strTokens == [
			"function", "void", "main", "{",
			"int", "i", "=", "5", ";",
			".", "5sdfdf", "=", "(", "!", "5", "-", "5", ")", ";",
			"a", ".", "b", ".", "c", "=", "@", "a", ";",
			"a", "=", "5.5", ";",
			"a", "=", "-20", "+", "5", ";",
			"a", "=", "-20", "+", "5", ";",
			"a", "==", "-", "b", ";",
			"a", "<=", "b", ";",
			"a", ">", "b"
		]);
}
/// literals and comments
unittest{
	string[] script = [
		"s = \"a b(#)\"; # comment",
		"c = '\\'';",
		"# only a comment",
	];
	TokenList list = separateTokens(script);
	string[] strTokens;
	strTokens.length = list.tokens.length;
	foreach (i, tok; list.tokens){
		strTokens[i] = tok.token;
	}
	assert (strTokens == [
			"s", "=", "\"a b(#)\"", ";",
			"c", "=", "'\\''", ";"
		]);
	assert (list.tokenPerLine == [4, 4, 0]);
}

/// Takes script, and separates into tokens (using `separateTokens`), identifies token types, returns the
/// Tokens with Token.Type in a TokenList
///
/// `script` is the script to convert to tokens, each line is a separate string, without ending \n
/// `errors` is the array to which errors will be put. It is only written to if there were errors, or if no
/// tokens were found
///
/// As a plus, it also checks if the brackets are in correct order (and properly closed)
package TokenList toTokens(string[] script, ref CompileError[] errors){
	compileErrors = new LinkedList!CompileError;
	TokenList tokens = separateTokens(script);
	if (tokens.tokens.length == 0){
		// nothing to identify, either the script has no tokens, or there were errors
		errors = compileErrors.toArray;
		.destroy(compileErrors);
		return tokens;
	}
	// identify type of each token, a token that cannot be identified becomes a compile error
	foreach (i, token; tokens.tokens){
		try{
			tokens.tokens[i].type = getTokenType(token.token);
		}catch (Exception e){
			compileErrors.append(CompileError(tokens.getTokenLine(i), e.msg));
		}
	}
	// now check brackets
	tokens.checkBrackets(compileErrors);
	if (compileErrors.count > 0){
		errors = compileErrors.toArray;
	}
	.destroy(compileErrors);
	return tokens;
}