1 /++
2 For reading a script into tokens
3 +/
4 module qscript.compiler.tokengen;
5 
6 import utils.misc;
7 import utils.lists;
8 import std.conv:to;
9 import qscript.compiler.misc;
10 
11 /// stores errors for tokengen
12 private LinkedList!CompileError compileErrors;
13 
/// Attempts to identify a token type by the token (string).
///
/// The checks are made in a fixed order, and the first one that matches decides the type:
/// `.` and `=` are matched first, then integers, doubles, data types, keywords, identifiers, operators, strings
/// (anything starting with a `"`), and finally the single-character punctuation tokens.
///
/// Returns: the type of the token
///
/// Throws: Exception if the token does not match any token type
private Token.Type getTokenType(string token){
	/// Returns: true if a string is a keyword
	bool isKeyword(string s){
		return KEYWORDS.hasElement(s);
	}
	/// Returns: true if a string is an identifier
	bool isIdentifier(string s){
		// a string that qualifies as a number (or is a keyword) can also pass the identifier character check below,
		// but must not be treated as an identifier, so those are rejected first
		if (s.isNum || isKeyword(s)){
			return false;
		}
		return (cast(char[])s).matchElements(cast(char[])IDENT_CHARS);
	}
	/// Returns: true if a string is an operator
	bool isOperator(string s){
		return OPERATORS.hasElement(s) || SOPERATORS.hasElement(s);
	}
	/// Returns: true if string contains an integer
	bool isInt(string s){
		return isNum(s, false);
	}
	/// Returns: true if a string contains a double
	///
	/// this also accepts strings without a decimal point, so it must only be used after `isInt` has failed
	bool isDouble(string s){
		return isNum(s, true);
	}

	// these two are matched before the number & operator checks
	if (token == "."){
		return Token.Type.MemberSelector;
	}
	if (token == "="){
		return Token.Type.AssignmentOperator;
	}
	// integer has to be checked before double, isDouble alone can not tell them apart
	if (isInt(token)){
		return Token.Type.Integer;
	}
	if (isDouble(token)){
		return Token.Type.Double;
	}
	if (DATA_TYPES.hasElement(token)){
		return Token.Type.DataType;
	}
	if (isKeyword(token)){
		return Token.Type.Keyword;
	}
	if (isIdentifier(token)){
		return Token.Type.Identifier;
	}
	if (isOperator(token)){
		return Token.Type.Operator;
	}
	// length is checked so that an empty token ends in the Exception below, not in a range violation
	if (token.length > 0 && token[0] == '"'){
		return Token.Type.String;
	}
	switch (token){
		case ";":
			return Token.Type.StatementEnd;
		case ",":
			return Token.Type.Comma;
		case "(":
			return Token.Type.ParanthesesOpen;
		case ")":
			return Token.Type.ParanthesesClose;
		case "[":
			return Token.Type.IndexBracketOpen;
		case "]":
			return Token.Type.IndexBracketClose;
		case "{":
			return Token.Type.BlockStart;
		case "}":
			return Token.Type.BlockEnd;
		default:
			throw new Exception("unidentified token type");
	}
}
///
unittest{
	/// a token, and the type it has to be identified as
	struct Expectation{
		string token;
		Token.Type type;
	}
	Expectation[] expectations = [
		Expectation("thisIsAVar_1234", Token.Type.Identifier),
		Expectation("24.5", Token.Type.Double),
		Expectation("245", Token.Type.Integer),
		Expectation("\"This is a string\"", Token.Type.String),
		Expectation("==", Token.Type.Operator),
		Expectation(";", Token.Type.StatementEnd),
		Expectation(",", Token.Type.Comma),
		Expectation("int", Token.Type.DataType),
		Expectation("double", Token.Type.DataType),
		Expectation("string", Token.Type.DataType),
		Expectation("function", Token.Type.Keyword),
		Expectation("if", Token.Type.Keyword),
		Expectation("while", Token.Type.Keyword),
		Expectation("else", Token.Type.Keyword),
		Expectation(".", Token.Type.MemberSelector)
	];
	foreach (expectation; expectations){
		assert(getTokenType(expectation.token) == expectation.type);
	}
}
100 
/// Converts each string in `s` to a Token, with the type identified using `getTokenType`
///
/// Returns: Token[], in the same order as `s`
///
/// Throws: Exception (from `getTokenType`) if the type of any of the strings could not be identified
package Token[] stringToTokens(string[] s){
	Token[] identified;
	identified.length = s.length;
	foreach (index, text; s){
		identified[index].type = getTokenType(text);
		identified[index].token = text.dup;
	}
	return identified;
}
111 
/// Reads script, and separates it into tokens
///
/// `script` is the script to read, each line is a separate string
///
/// Spaces and tabs separate tokens and are dropped, a `#` ends the reading of a line, and a string (from the
/// opening `"` to the index returned by `strEnd`) is kept as a single token.
/// Errors found while separating (string not closed, unexpected char) are appended to `compileErrors`, and the
/// rest of that line is not read.
///
/// The type of each token is identified where possible. A token whose type can not be identified is still
/// returned, with its type left at the default value, no Exception is thrown for it from here: `toTokens`
/// identifies the types again, and reports such a token as a CompileError along with its line number.
///
/// Returns: TokenList containing the tokens, and the number of tokens read from each line
private TokenList separateTokens(string[] script){
	enum CharType{
		Bracket, /// any bracket
		Operator, /// any char that can be a part of a operator
		Semicolon, /// semicolon
		Comma, /// a comma
		Ident /// including the ones for keywords
	}
	/// Returns: the CharType of `c`. `prev` is the char right before `c` in the line, 0x00 if there is none
	///
	/// Throws: Exception if `c` does not belong to any CharType
	static CharType getCharType(char c, char prev = 0x00){
		if (c == ';'){
			return CharType.Semicolon;
		}
		if (c == ','){
			return CharType.Comma;
		}
		if (['(','[','{','}',']',')'].hasElement(c)){
			return CharType.Bracket;
		}
		if (c == '.'){
			// a dot right after a digit is read as part of that token (decimal point), any other dot is an operator
			if (prev == 0x00 || !(cast(string)[prev]).isNum(false)){
				return CharType.Operator;
			}
			return CharType.Ident;
		}
		if (isAlphabet(cast(string)[c]) || isNum(cast(string)[c])){
			return CharType.Ident;
		}
		// anything else is only valid if it appears in some operator
		foreach (operator; OPERATORS~SOPERATORS){
			foreach (opChar; operator){
				if (c == opChar){
					return CharType.Operator;
				}
			}
		}
		throw new Exception ("unexpected char, '"~c~'\'');
	}
	LinkedList!string tokens = new LinkedList!string;
	uinteger[] tokenPerLine;
	tokenPerLine.length = script.length;
	uinteger tokenCount = 0;
	foreach (lineno, line; script){
		CharType prevType = CharType.Ident, currentType = CharType.Ident;
		// `readFrom` is the index where the token currently being read starts
		for (uinteger i = 0, readFrom = 0, lastInd = line.length-1; i < line.length; i ++){
			// strings are added as a whole, whatever they contain
			if (line[i] == '"'){
				if (readFrom != i){
					if (readFrom < i){
						// add the previous token
						tokens.append(line[readFrom .. i]);
						readFrom = i;
					}else{
						compileErrors.append (CompileError(lineno, "unexpected string"));
					}
				}
				integer end = line.strEnd(i);
				if (end == -1){
					compileErrors.append(CompileError(lineno, "string not closed"));
					break;
				}
				// append the string
				tokens.append(line[readFrom .. end+1]);
				readFrom = end+1;
				i = end;
				continue;
			}
			// whitespace ends the current token, a comment ends the line
			if (line[i] == '#' || line[i] == ' ' || line[i] == '\t'){
				// add a token if remaining
				if (readFrom < i){
					tokens.append (line[readFrom .. i]);
				}
				readFrom = i+1;
				if (line[i] == '#'){
					break;
				}
				continue;
			}
			// add other types of tokens
			try{
				currentType = getCharType(line[i], i > 0 ? line[i-1] : 0x00);
			}catch (Exception e){
				compileErrors.append (CompileError(lineno, e.msg));
				.destroy (e);
				break;
			}
			// a token ends where the type of char changes. Brackets, semicolons, and commas are always a token of
			// their own, even when several of them are next to each other
			if (currentType != prevType || currentType == CharType.Bracket || currentType == CharType.Semicolon ||
				currentType == CharType.Comma){
				if (readFrom < i){
					tokens.append (line[readFrom .. i]);
					readFrom = i;
				}
				if (currentType == CharType.Bracket || currentType == CharType.Semicolon || currentType == CharType.Comma){
					tokens.append (cast(string)[line[i]]);
					readFrom = i+1;
				}
			}
			prevType = currentType;
			// add if is at end of line
			if (i == lastInd && readFrom <= i){
				tokens.append (line[readFrom .. i+1]);
			}
		}
		tokenPerLine[lineno] = tokens.count - tokenCount;
		tokenCount += tokenPerLine[lineno];
	}
	// put them all in TokenList
	string[] tokenStrings = tokens.toArray;
	.destroy (tokens);
	Token[] identified;
	identified.length = tokenStrings.length;
	foreach (i, tokenString; tokenStrings){
		identified[i].token = tokenString.dup;
		try{
			identified[i].type = getTokenType(tokenString);
		}catch (Exception e){
			// not reported from here on purpose: the line number of the token is not known at this point, and
			// toTokens identifies the types again & reports the token with its line number
			.destroy (e);
		}
	}
	TokenList r;
	r.tokenPerLine = tokenPerLine; // no need to dup it
	r.tokens = identified;
	return r;
}
// a token with a type that can not be identified must be returned as it is, and must not cause an Exception
unittest{
	Token[] tokens = separateTokens(["\t5.a;"]).tokens;
	assert (tokens.length == 2);
	assert (tokens[0].token == "5.a");
	assert (tokens[1].token == ";");
}
///
unittest{
	Token[] tokens = separateTokens([
		"function void main{",
		"\tint i = 5;",
		"\t.5sdfdf = (!5 - 5);",
		"\ta.b.c = @a;",
		"\ta = 5.5;"
	]).tokens;
	// only the text of each token is compared
	string[] separated;
	foreach (tok; tokens){
		separated ~= tok.token;
	}
	string[] expected = [
		"function", "void", "main", "{",
		"int", "i", "=", "5", ";",
		".", "5sdfdf", "=", "(", "!", "5", "-", "5", ")", ";",
		"a", ".", "b", ".", "c", "=", "@", "a", ";",
		"a", "=", "5.5", ";"
	];
	assert (separated == expected);
}
/// Takes script, and separates into tokens (using `separateTokens`), identifies token types, returns the Tokens with
/// Token.Type in a TokenList
///
/// `script` is the script to convert to tokens, each line is a separate string, without ending \n
/// `errors` is the array to which errors will be put. It is always overwritten, and is left empty if there were no
/// errors
///
/// As a plus, it also checks if the brackets are in correct order (and properly closed), using `checkBrackets`
///
/// Returns: the TokenList. A token whose type could not be identified is reported in `errors`, and is still present
/// in the TokenList
package TokenList toTokens(string[] script, ref CompileError[] errors){
	compileErrors = new LinkedList!CompileError;
	TokenList tokens = separateTokens(script);
	// with no tokens, there is nothing to identify, and no brackets to check
	if (tokens.tokens.length > 0){
		// identify the type of each token, an unidentified token becomes a CompileError on the line it was read from
		foreach (i, token; tokens.tokens){
			try{
				tokens.tokens[i].type = getTokenType(token.token);
			}catch (Exception e){
				compileErrors.append(CompileError(tokens.getTokenLine(i), e.msg));
			}
		}
		// now check brackets
		tokens.checkBrackets(compileErrors);
	}
	// assigned even when there are no errors, so errors left in the array by its previous use are not mistaken
	// for errors in this script
	errors = compileErrors.toArray;
	.destroy(compileErrors);
	return tokens;
}