1 /++
2 For reading a script into tokens
3 +/
4 module qscript.compiler.tokengen;
5 
6 import utils.misc;
7 import utils.lists;
8 import std.conv:to;
9 import qscript.compiler.misc;
10 
/// Stores the errors found while generating tokens.
///
/// `toTokens` assigns a fresh list to this before tokenizing, and destroys it before returning.
/// In between, `separateTokens` and `toTokens` append to it, so it must only be used while `toTokens` is running.
private LinkedList!CompileError compileErrors;
13 
/// Attempts to identify a token's type from the token's text.
///
/// The checks are made in a fixed order, the first one that matches decides the type:
/// member selector, assignment operator, integer, double, data type, keyword, identifier, operator, string, char,
/// and then the single character punctuation tokens.
///
/// Params:
/// token = the text of the token, exactly as it was read from the script
///
/// Returns: the type of the token
///
/// Throws: Exception if `token` is empty, if it is a malformed character literal, or if it matches no known type
private Token.Type getTokenType(string token){
	/// Returns true if a string is a keyword
	bool isKeyword(string s){
		return KEYWORDS.hasElement(s);
	}
	/// Returns true if a string is an identifier
	bool isIdentifier(string s){
		// a string that qualifies as a number, or is a keyword, could also pass the character check below, so those
		// are ruled out first
		if (s.isNum || isKeyword(s))
			return false;
		return (cast(char[])s).matchElements(cast(char[])IDENT_CHARS);
	}
	/// Returns true if a string is an operator
	bool isOperator(string s){
		return OPERATORS.hasElement(s) || SOPERATORS.hasElement(s);
	}
	/// Returns true if string contains an integer
	bool isInt(string s){
		// NOTE(review): the second argument to isNum presumably decides whether a decimal point is accepted - confirm
		return isNum(s, false);
	}
	/// Returns true if a string contains a number, with or without decimal point
	/// 
	/// this is only checked after `isInt` has failed, so a token identified as double through this must have a
	/// decimal point in it
	bool isDouble(string s){
		return isNum(s, true);
	}
	// an empty token has no type, and must never reach the `token[0]` checks below
	if (token.length == 0)
		throw new Exception("unidentified token type '"~token~'\'');
	if (token == "."){
		return Token.Type.MemberSelector;
	}else if (token == "="){
		return Token.Type.AssignmentOperator;
	}else if (isInt(token)){
		return Token.Type.Integer;
	}else if (isDouble(token)){
		return Token.Type.Double;
	}else if (DATA_TYPES.hasElement(token)){
		return Token.Type.DataType;
	}else if (isKeyword(token)){
		return Token.Type.Keyword;
	}else if (isIdentifier(token)){
		return Token.Type.Identifier;
	}else if (isOperator(token)){
		return Token.Type.Operator;
	}else if (token[0] == '"'){
		// only the opening quote is looked at, the closing quote is not verified here
		return Token.Type.String;
	}else if (token[0] == '\''){
		// shortest valid char token is 3 characters: opening quote, the character, closing quote
		if (token.length < 3)
			throw new Exception("no character provided inside ''");
		// length is checked on the output of decodeString, not on the raw text between the quotes
		if (decodeString(token[1 .. $-1]).length > 1)
			throw new Exception("'' can only hold 1 character");
		return Token.Type.Char;
	}else if (token == ";"){
		return Token.Type.StatementEnd;
	}else if (token == ","){
		return Token.Type.Comma;
	}else if (token == "("){
		return Token.Type.ParanthesesOpen;
	}else if (token == ")"){
		return Token.Type.ParanthesesClose;
	}else if (token == "["){
		return Token.Type.IndexBracketOpen;
	}else if (token == "]"){
		return Token.Type.IndexBracketClose;
	}else if (token == "{"){
		return Token.Type.BlockStart;
	}else if (token == "}"){
		return Token.Type.BlockEnd;
	}else{
		throw new Exception("unidentified token type '"~token~'\'');
	}
}
///
unittest{
	// text of a token, mapped to the type that `getTokenType` is expected to report for it
	Token.Type[string] expectedTypes = [
		"thisIsAVar_1234" : Token.Type.Identifier,
		"24.5" : Token.Type.Double,
		"245" : Token.Type.Integer,
		"\"This is a string\"" : Token.Type.String,
		"==" : Token.Type.Operator,
		";" : Token.Type.StatementEnd,
		"," : Token.Type.Comma,
		"int" : Token.Type.DataType,
		"double" : Token.Type.DataType,
		"char" : Token.Type.DataType,
		"function" : Token.Type.Keyword,
		"if" : Token.Type.Keyword,
		"while" : Token.Type.Keyword,
		"else" : Token.Type.Keyword,
		"." : Token.Type.MemberSelector,
		"\'p\'" : Token.Type.Char
	];
	foreach (text, expectedType; expectedTypes){
		assert(getTokenType(text) == expectedType);
	}
}
107 
/// Wraps each string from `s` in a Token
///
/// Only the `token` member of each Token is set (to a copy of the string), the `type` is left at its initial value,
/// it is not identified here.
///
/// Returns: Token[] of same length as `s`, in the same order
package Token[] stringToTokens(string[] s){
	Token[] wrapped = new Token[s.length];
	foreach (index, ref current; wrapped){
		current.token = s[index].dup;
	}
	return wrapped;
}
118 
/// Reads script, and separates tokens
///
/// Only the text of each token is found here, the types are not identified.
/// Everything on a line from a `#` onwards is skipped as a comment, unless the `#` is inside a string or char literal.
/// If a string or char literal is not closed on the line it starts on, a "string not closed" error is appended to
/// `compileErrors` (which the caller must have initialised), and the rest of that line is skipped.
///
/// Params:
/// script = the script, each line as a separate string
///
/// Returns: TokenList with `tokens` holding the separated tokens, and `tokenPerLine` holding the number of tokens
/// read from each line
private TokenList separateTokens(string[] script){
	// state shared between calls to isDifferent. These are locals, so that nothing carries over from a previous
	// call to separateTokens
	char[] lastToken = []; /// stores last complete token, used to check if `-` or `.` is to be considered operator or part of number
	char pendingTokenChar = 0; /// char that ended the last token, and is to become the start of next token. 0 if none
	/// Decides what to do with the next character `c`, given `token`, the token read so far.
	///
	/// `c` is either added to `token`, dropped (whitespace), or stored to become the start of the next token.
	///
	/// Returns: true if `token` is now complete, false if more characters are needed
	bool isDifferent(char c, ref char[] token){
		static const SEPERATORS = ['(','[','{','}',']',')', ';', ','];
		static const WHITESPACE = [' ', '\t'];
		// a char left over from last call starts this token
		if (pendingTokenChar != 0){
			token = [pendingTokenChar];
			pendingTokenChar = 0;
			// separators are always complete tokens on their own, so `c` has to wait for next call
			if (SEPERATORS.hasElement(token[0])){
				if (!WHITESPACE.hasElement(c))
					pendingTokenChar = c;
				lastToken = token.dup;
				return true;
			}
		}
		// inside a string/char literal, everything is added, until the closing quote
		if (token.length && ['"', '\''].hasElement(token[0])){
			token = token ~ c;
			if (c == token[0]){
				// the quote closes the literal only if it's not escaped, i.e: an even number of backslashes come
				// right before it
				uinteger backslashes = 0;
				for (uinteger j = token.length - 1; j > 0 && token[j-1] == '\\'; j --)
					backslashes ++;
				if (backslashes % 2 == 0){
					lastToken = token.dup;
					return true;
				}
			}
			return false;
		}
		// whitespace ends a token, if there was any
		if (WHITESPACE.hasElement(c)){
			if (token.length > 0){
				lastToken = token.dup;
				return true;
			}
			return false;
		}
		if (SEPERATORS.hasElement(c)){
			if (token.length == 0){
				token = [c];
				lastToken = token.dup;
				return true;
			}
			// the separator ends current token, and becomes a token itself in next call
			pendingTokenChar = c;
			lastToken = token.dup;
			return true;
		}
		if (token.length > 0){
			// start of a string/char literal right after another token, it gets read as a separate token
			if (c == '\"' || c == '\''){
				pendingTokenChar = c;
				lastToken = token.dup;
				return true;
			}
			// - is operator or part of number
			if (token[0] == '-' && isNum([c],false) && !(lastToken.matchElements(cast(char[])IDENT_CHARS))){
				token = token ~ c;
				// go on
				return false;
			}
			// . is memberSelector or decimal place
			if (c == '.' && !isNum(cast(string)token, false)){
				lastToken = token.dup;
				pendingTokenChar = c;
				return true;
			}
			// token is operator
			if (OPERATORS.hasElement(cast(string)token) || SOPERATORS.hasElement(cast(string)token)){
				// see if it's still operator after adding c
				if (OPERATORS.hasElement(cast(string)(token ~ c)) || SOPERATORS.hasElement(cast(string)(token ~ c))){
					// go on
					token = token ~ c;
					return false;
				}else{
					pendingTokenChar = c;
					lastToken = token.dup;
					return true;
				}
			}else if ((OPERATORS.hasElement(cast(string)[c]) || SOPERATORS.hasElement(cast(string)[c])) && !isNum(cast(string)(token~c))){
				// token not operator, c is operator
				pendingTokenChar = c;
				lastToken = token.dup;
				return true;
			}
		}
		// nothing else matches, just add it to end
		token = token ~ c;
		return false;
	}
	LinkedList!string tokens = new LinkedList!string;
	uinteger[] tokenPerLine;
	tokenPerLine.length = script.length;
	uinteger tokenCount = 0;
	foreach (lineno, line; script){
		integer stringEndIndex = -1; /// index of closing quote of the last string found in this line, -1 if none yet
		char[] token = [];
		for (uinteger i = 0; i < line.length; i ++){
			// find where strings end, so their contents are not mistaken for comments.
			// `i` is cast to signed, otherwise comparing it against the initial -1 is done unsigned, and is never true
			if ((line[i] == '"' || line[i] == '\'') && cast(integer)i > stringEndIndex){
				stringEndIndex = line.strEnd(i);
				if (stringEndIndex == -1){
					compileErrors.append(CompileError(lineno, "string not closed"));
					break;
				}
			}
			// break at comments
			if (line[i] == '#' && cast(integer)i > stringEndIndex){
				// a whitespace makes isDifferent finish whatever is pending
				isDifferent(' ', token);
				// add pending token
				if (token.length){
					tokens.append(cast(string)token.dup);
					token = [];
				}
				break;
			}
			// hand this line[i] to isDifferent
			if (isDifferent(line[i], token)){
				tokens.append(cast(string)token.dup);
				token = [];
			}
		}
		// end of line ends the token too, a whitespace makes isDifferent finish whatever is pending
		isDifferent(' ', token);
		if (token.length)
			tokens.append(cast(string)token.dup);
		tokenPerLine[lineno] = tokens.count - tokenCount;
		tokenCount += tokenPerLine[lineno];
	}
	// put them all in TokenList
	TokenList r;
	r.tokenPerLine = tokenPerLine; // no need to dup it
	r.tokens = stringToTokens(tokens.toArray);
	.destroy (tokens);
	return r;
}
///
unittest{
	// tokens as separated from a small script, which has no string or char literals in it
	Token[] separated = separateTokens([
		"function void main{",
		"\tint i = 5;",
		"\t.5sdfdf = (!5 - 5);",
		"\ta.b.c = @a;",
		"\ta = 5.5;",
		" a = -20+5;",
		" a=-20+5;",
		" a == -b;",
		"a <= b;",
		"a > b",
	]).tokens;
	// only the text of tokens is compared
	string[] actual;
	foreach (tok; separated){
		actual ~= tok.token;
	}
	string[] expected = [
		"function", "void", "main", "{",
		"int", "i", "=", "5", ";",
		".", "5sdfdf", "=", "(", "!", "5", "-", "5", ")", ";",
		"a", ".", "b", ".", "c", "=", "@", "a", ";",
		"a", "=", "5.5", ";",
		"a", "=", "-20", "+", "5", ";",
		"a", "=", "-20", "+", "5", ";",
		"a", "==", "-", "b", ";",
		"a", "<=", "b", ";",
		"a", ">", "b"
	];
	assert (actual == expected);
}
/// Separates a script into tokens (using `separateTokens`), identifies the type of each token (using `getTokenType`),
/// and returns them in a TokenList
/// 
/// `script` is the script to convert to tokens, each line is a separate string, without ending \n
/// `errors` is the array to which errors will be put. If tokens were read, and no errors occurred, it is left as it was
/// 
/// As a plus, it also checks the brackets, through `checkBrackets`
package TokenList toTokens(string[] script, ref CompileError[] errors){
	compileErrors = new LinkedList!CompileError;
	TokenList tokens = separateTokens(script);
	// no tokens were read, nothing to identify. Hand over whatever errors there are, and leave
	if (tokens.tokens.length == 0){
		errors = compileErrors.toArray;
		.destroy(compileErrors);
		return tokens;
	}
	// identify type of each token, a token that cannot be identified becomes an error, and the rest are still done
	for (uinteger index = 0; index < tokens.tokens.length; index ++){
		try{
			tokens.tokens[index].type = getTokenType(tokens.tokens[index].token);
		}catch (Exception e){
			compileErrors.append(CompileError(tokens.getTokenLine(index), e.msg));
		}
	}
	// now check brackets
	tokens.checkBrackets(compileErrors);
	// `errors` is only written to if there is something to report
	if (compileErrors.count > 0)
		errors = compileErrors.toArray;
	.destroy(compileErrors);
	return tokens;
}