/++
For reading a script into tokens
+/
module qscript.compiler.tokengen;

import utils.misc;
import utils.lists;
import std.conv:to;
import qscript.compiler.misc;

/// stores errors for tokengen
private LinkedList!CompileError compileErrors;

/// Attempts to identify a token type by the token (string).
///
/// `token` is the text of a single token
///
/// The checks run in a fixed order, and the first one that matches decides the type: member selector, assignment operator,
/// integer, double, data type, keyword, identifier, operator, string, and finally the single-char punctuation tokens.
///
/// returns token type, if fails (including when `token` is empty), throws Exception
private Token.Type getTokenType(string token){
	/// Returns true if a string is a keyword
	bool isKeyword(string s){
		return KEYWORDS.hasElement(s);
	}
	/// Returns true if a string is an identifier
	bool isIdentifier(string s){
		// token that qualifies as a number can qualify as an identifier, but not vice versa, so this if statement
		if (!s.isNum && !isKeyword(s)){
			return (cast(char[])s).matchElements(cast(char[])IDENT_CHARS);
		}else{
			return false;
		}
	}
	/// Returns true if a string is an operator
	bool isOperator(string s){
		return OPERATORS.hasElement(s) || SOPERATORS.hasElement(s);
	}
	/// Returns true if string contains an integer
	bool isInt(string s){
		return isNum(s, false);
	}
	/// Returns true if a string contains a double
	///
	/// to be identified as a double, the number must have a decimal point in it
	bool isDouble(string s){
		return isNum(s, true);
	}
	if (token == "."){
		return Token.Type.MemberSelector;
	}else if (token == "="){
		return Token.Type.AssignmentOperator;
	}else if (isInt(token)){
		return Token.Type.Integer;
	}else if (isDouble(token)){
		return Token.Type.Double;
	}else if (DATA_TYPES.hasElement(token)){
		return Token.Type.DataType;
	}else if (isKeyword(token)){
		return Token.Type.Keyword;
	}else if (isIdentifier(token)){
		return Token.Type.Identifier;
	}else if (isOperator(token)){
		return Token.Type.Operator;
	}else if (token.length > 0 && token[0] == '"'){
		// length is checked first, so that an empty token ends in the Exception below, instead of an out of bounds index
		return Token.Type.String;
	}else if (token == ";"){
		return Token.Type.StatementEnd;
	}else if (token == ","){
		return Token.Type.Comma;
	}else if (token == "("){
		return Token.Type.ParanthesesOpen;
	}else if (token == ")"){
		return Token.Type.ParanthesesClose;
	}else if (token == "["){
		return Token.Type.IndexBracketOpen;
	}else if (token == "]"){
		return Token.Type.IndexBracketClose;
	}else if (token == "{"){
		return Token.Type.BlockStart;
	}else if (token == "}"){
		return Token.Type.BlockEnd;
	}else{
		throw new Exception("unidentified token type");
	}
}
///
unittest{
	assert("thisIsAVar_1234".getTokenType == Token.Type.Identifier);
	assert("24.5".getTokenType == Token.Type.Double);
	assert("245".getTokenType == Token.Type.Integer);
	assert("\"This is a string\"".getTokenType == Token.Type.String);
	assert("==".getTokenType == Token.Type.Operator);
	assert(";".getTokenType == Token.Type.StatementEnd);
	assert(",".getTokenType == Token.Type.Comma);
	assert("int".getTokenType == Token.Type.DataType);
	assert("double".getTokenType == Token.Type.DataType);
	assert("string".getTokenType == Token.Type.DataType);
	assert("function".getTokenType == Token.Type.Keyword);
	assert("if".getTokenType == Token.Type.Keyword);
	assert("while".getTokenType == Token.Type.Keyword);
	assert("else".getTokenType == Token.Type.Keyword);
	assert(".".getTokenType == Token.Type.MemberSelector);
}

/// returns Token[] with type identified based on string[] input
///
/// `s` is the array of token strings, each element becomes one Token, in the same order
///
/// throws Exception (from `getTokenType`) if the type of any of the tokens cannot be identified
package Token[] stringToTokens(string[] s){
	Token[] r;
	r.length = s.length;
	foreach (i, token; s){
		r[i].type = getTokenType(s[i]);
		r[i].token = s[i].dup;
	}
	return r;
}

/// Reads script, and separates tokens
///
/// `script` is the script to separate into tokens, each line is a separate string
///
/// Only the `token` string of each returned Token is filled in, the `type` is left at its initial value, identifying the
/// types (and reporting the tokens that cannot be identified) is done by `toTokens`.
/// Errors found while separating are appended to `compileErrors`.
///
/// returns TokenList with the tokens, and the number of tokens in each line
private TokenList separateTokens(string[] script){
	enum CharType{
		Bracket, /// any bracket
		Operator, /// any char that can be a part of a operator
		Semicolon, /// semicolon
		Comma, /// a comma
		Ident /// including the ones for keywords
	}
	/// returns the CharType of `c`, `prev` is the char right before `c` in the line (0x00 if there is none)
	///
	/// throws Exception if `c` does not belong to any CharType
	static CharType getCharType(char c, char prev = 0x00){
		if (c == ';'){
			return CharType.Semicolon;
		}
		if (c == ','){
			return CharType.Comma;
		}
		if (['(','[','{','}',']',')'].hasElement(c)){
			return CharType.Bracket;
		}
		if (c == '.'){
			// a dot right after a digit is read as a decimal point, any other dot is read as an operator
			if (prev == 0x00 || !(cast(string)[prev]).isNum(false)){
				return CharType.Operator;
			}
			return CharType.Ident;
		}
		if (isAlphabet(cast(string)[c]) || isNum(cast(string)[c])){
			return CharType.Ident;
		}
		// a char that is a part of any operator is an operator char, both lists are looked into one after the other, so
		// no concatenated array is built for every char
		foreach (operator; OPERATORS){
			foreach (opChar; operator){
				if (c == opChar){
					return CharType.Operator;
				}
			}
		}
		foreach (operator; SOPERATORS){
			foreach (opChar; operator){
				if (c == opChar){
					return CharType.Operator;
				}
			}
		}
		throw new Exception ("unexpected char, '"~c~'\'');
	}
	LinkedList!string tokens = new LinkedList!string;
	uinteger[] tokenPerLine;
	tokenPerLine.length = script.length;
	uinteger tokenCount = 0;
	foreach (lineno, line; script){
		CharType prevType = CharType.Ident, currentType = CharType.Ident;
		// `readFrom` is the index where the token being read started
		// `lastInd` wraps around when the line is empty, but then the loop body never runs
		for (uinteger i = 0, readFrom = 0, lastInd = line.length-1; i < line.length; i ++){
			// skip strings
			if (line[i] == '"'){
				if (readFrom != i){
					if (readFrom < i){
						// add the previous token
						tokens.append(line[readFrom .. i]);
						readFrom = i;
					}else{
						compileErrors.append (CompileError(lineno, "unexpected string"));
					}
				}
				// -1 is taken as no closing quote found
				integer end = line.strEnd(i);
				if (end == -1){
					compileErrors.append(CompileError(lineno, "string not closed"));
					break;
				}
				// append the string, quotes included
				tokens.append(line[readFrom .. end+1]);
				readFrom = end+1;
				i = end;
				continue;
			}
			// break at comments, and end the current token at whitespace
			if (line[i] == '#' || line[i] == ' ' || line[i] == '\t'){
				// add a token if remaining
				if (readFrom < i){
					tokens.append (line[readFrom .. i]);
				}
				readFrom = i+1;
				if (line[i] == '#'){
					break;
				}
				continue;
			}
			// add other types of tokens
			try{
				currentType = getCharType(line[i], i > 0 ? line[i-1] : 0x00);
			}catch (Exception e){
				compileErrors.append (CompileError(lineno, e.msg));
				.destroy (e);
				// rest of the line is not read
				break;
			}
			// a token ends where the char type changes, brackets, semicolons, and commas are always tokens on their own
			if (currentType != prevType || currentType == CharType.Bracket || currentType == CharType.Semicolon ||
				currentType == CharType.Comma){
				if (readFrom < i){
					tokens.append (line[readFrom .. i]);
					readFrom = i;
				}
				if (currentType == CharType.Bracket || currentType == CharType.Semicolon || currentType == CharType.Comma){
					tokens.append (cast(string)[line[i]]);
					readFrom = i+1;
				}
			}
			prevType = currentType;
			// add if is at end of line
			if (i == lastInd && readFrom <= i){
				tokens.append (line[readFrom .. i+1]);
			}
		}
		tokenPerLine[lineno] = tokens.count - tokenCount;
		tokenCount += tokenPerLine[lineno];
	}
	// put them all in TokenList
	TokenList r;
	r.tokenPerLine = tokenPerLine; // no need to dup it
	// types are not identified here, because `getTokenType` throws on a token it cannot identify, and from here that
	// Exception would go out of `toTokens`, instead of being reported as a CompileError with the line number
	string[] tokenStrings = tokens.toArray;
	Token[] separated;
	separated.length = tokenStrings.length;
	foreach (i, tokenString; tokenStrings){
		separated[i].token = tokenString.dup;
	}
	r.tokens = separated;
	.destroy (tokens);
	return r;
}
///
unittest{
	string[] script = [
		"function void main{",
		"\tint i = 5;",
		"\t.5sdfdf = (!5 - 5);",
		"\ta.b.c = @a;",
		"\ta = 5.5;"
	];
	Token[] tokens = separateTokens(script).tokens;
	string[] strTokens;
	strTokens.length = tokens.length;
	foreach (i, tok; tokens){
		strTokens[i] = tok.token;
	}
	assert (strTokens == [
			"function", "void", "main", "{",
			"int", "i", "=", "5", ";",
			".", "5sdfdf", "=", "(", "!", "5", "-", "5", ")", ";",
			"a", ".", "b", ".", "c", "=", "@", "a", ";",
			"a", "=", "5.5", ";"
		]);
}
/// Takes script, and separates into tokens (using `separateTokens`), identifies token types, returns the Tokens with Token.Type
/// in an array
///
/// `script` is the script to convert to tokens, each line is a separate string, without ending \n
/// `errors` is the array to which errors will be put, it is left as it was if tokens were found and there were no errors
///
/// A token whose type cannot be identified is reported in `errors`, with the line it is on, and its type is left unset.
///
/// As a plus, it also checks if the brackets are in correct order (and properly closed)
package TokenList toTokens(string[] script, ref CompileError[] errors){
	compileErrors = new LinkedList!CompileError;
	TokenList tokens = separateTokens(script);
	if (tokens.tokens == null || tokens.tokens.length == 0){
		// no tokens were found, so only the errors from separating, if any, are there to report
		errors = compileErrors.toArray;
		.destroy(compileErrors);
		return tokens;
	}else{
		// continue with identifying tokens
		// this is the only place where types are identified, so every unidentified token is caught here
		foreach(i, token; tokens.tokens){
			try{
				tokens.tokens[i].type = getTokenType(token.token);
			}catch(Exception e){
				compileErrors.append(CompileError(tokens.getTokenLine(i), e.msg));
			}
		}
		// now check brackets
		tokens.checkBrackets(compileErrors);
		if (compileErrors.count > 0){
			errors = compileErrors.toArray;
		}
		.destroy(compileErrors);
		return tokens;
	}
}