1 /++
2 For reading a script into tokens
3 +/
4 module qscript.compiler.tokengen;
5 
6 import utils.misc;
7 import utils.lists;
8 import std.conv:to;
9 import qscript.compiler.compiler;
10 
11 /// Each token is stored as a `Token` with the type and the actual token
12 package struct Token{
13 	/// Specifies type of token
14 	/// 
15 	/// used only in `compiler.tokengen`
16 	enum Type{
17 		String,/// That the token is: `"SOME STRING"`
18 		Char, /// That the token is: `'C'` # Some character
19 		Integer,/// That the token an int (or uint, or ubyte)
20 		Double, /// That the token is a double (floating point) value
21 		Bool, /// a true or false value
22 		Identifier,/// That the token is an identifier. i.e token is a variable name or a function name.  For a token to be marked as Identifier, it doesn't need to be defined in `new()`
23 		DataType, /// the  token is a data type
24 		MemberSelector, /// a member selector operator
25 		AssignmentOperator, /// and assignmentOperator
26 		Operator,/// That the token is an operator, like `+`, `==` etc
27 		Keyword,/// A `function` or `var` ...
28 		Comma,/// That its a comma: `,`
29 		StatementEnd,/// A semicolon
30 		ParanthesesOpen,/// `(`
31 		ParanthesesClose,/// `)`
32 		IndexBracketOpen,/// `[`
33 		IndexBracketClose,///`]`
34 		BlockStart,///`{`
35 		BlockEnd,///`}`
36 	}
37 	Type type;/// type of token
38 	string token;/// token
39 	this(Type tType, string tToken){
40 		type = tType;
41 		token = tToken;
42 	}
43 	/// Identifies token type itself
44 	/// 
45 	/// Throws: Exception if token invalid
46 	this(string tToken){
47 		token = tToken;
48 		type = getTokenType(token);
49 	}
50 }
51 
52 /// To store Tokens with Types where the line number of each token is required
53 package struct TokenList{
54 	Token[] tokens; /// Stores the tokens
55 	uinteger[] tokenPerLine; /// Stores the number of tokens in each line
56 	/// Returns the line number a token is in by usin the index of the token in `tokens`
57 	/// 
58 	/// The returned value is the line-number, not index, it starts from 1, not zero
59 	uinteger getTokenLine(uinteger tokenIndex){
60 		uinteger i = 0, chars = 0;
61 		tokenIndex ++;
62 		for (; chars < tokenIndex && i < tokenPerLine.length; i ++){
63 			chars += tokenPerLine[i];
64 		}
65 		return i;
66 	}
67 	/// reads tokens into a string
68 	static string toString(Token[] t){
69 		char[] r;
70 		// set length
71 		uinteger length = 0;
72 		for (uinteger i = 0; i < t.length; i ++){
73 			length += t[i].token.length;
74 		}
75 		r.length = length;
76 		// read em all into r;
77 		for (uinteger i = 0, writeTo = 0; i < t.length; i ++){
78 			r[writeTo .. writeTo + t[i].token.length] = t[i].token;
79 			writeTo += t[i].token.length;
80 		}
81 		return cast(string)r;
82 	}
83 }
84 
85 /// Attempts to identify a token type by the token (string).
86 /// returns token type, if fails, throws exception
87 private Token.Type getTokenType(string token){
88 	/// Returns true if a string is a keyword
89 	bool isKeyword(string s){
90 		return KEYWORDS.hasElement(s);
91 	}
92 	/// Returns true if a string is an identifier
93 	bool isIdentifier(string s){
94 		// token that qualifies as a number can qualify as an identifier, but not vice versa, so this if statement
95 		if (!token.isNum && !isKeyword(token)){
96 			return (cast(char[])s).matchElements(cast(char[])IDENT_CHARS);
97 		}else{
98 			return false;
99 		}
100 	}
101 	/// Returns true is a string is an operator
102 	bool isOperator(string s){
103 		return OPERATORS.hasElement(s) || SOPERATORS.hasElement(s);
104 	}
105 	/// Returns true if string contains an integer
106 	bool isInt(string s){
107 		return isNum(s, false);
108 	}
109 	/// Returns true if a string contains a double
110 	/// 
111 	/// to be identified as a double, the number must have a decimal point in it
112 	bool isDouble(string s){
113 		return isNum(s, true);
114 	}
115 	if (token == "."){
116 		return Token.Type.MemberSelector;
117 	}else if (token == "="){
118 		return Token.Type.AssignmentOperator;
119 	}else if (isInt(token)){
120 		return Token.Type.Integer;
121 	}else if (isDouble(token)){
122 		return Token.Type.Double;
123 	}else if (token == "true" || token == "false"){
124 		return Token.Type.Bool;
125 	}else if (DATA_TYPES.hasElement(token)){
126 		return Token.Type.DataType;
127 	}else if (isKeyword(token)){
128 		return Token.Type.Keyword;
129 	}else if (isIdentifier(token)){
130 		return Token.Type.Identifier;
131 	}else if (isOperator(token)){
132 		return Token.Type.Operator;
133 	}else if (token[0] == '"'){
134 		return Token.Type.String;
135 	}else if (token[0] == '\''){
136 		if (token.length < 3)
137 			throw new Exception("no character provided inside ''");
138 		if (decodeString(token[1 .. $-1]).length > 1)
139 			throw new Exception("'' can only hold 1 character");
140 		return Token.Type.Char;
141 	}else if (token == ";"){
142 		return Token.Type.StatementEnd;
143 	}else if (token == ","){
144 		return Token.Type.Comma;
145 	}else if (token == "("){
146 		return Token.Type.ParanthesesOpen;
147 	}else if (token == ")"){
148 		return Token.Type.ParanthesesClose;
149 	}else if (token == "["){
150 		return Token.Type.IndexBracketOpen;
151 	}else if (token == "]"){
152 		return Token.Type.IndexBracketClose;
153 	}else if (token == "{"){
154 		return Token.Type.BlockStart;
155 	}else if (token == "}"){
156 		return Token.Type.BlockEnd;
157 	}else{
158 		throw new Exception("unidentified token type '"~token~'\'');
159 	}
160 }
161 ///
162 unittest{
163 	assert("thisIsAVar_1234".getTokenType == Token.Type.Identifier);
164 	assert("24.5".getTokenType == Token.Type.Double);
165 	assert("245".getTokenType == Token.Type.Integer);
166 	assert("\"This is a string\"".getTokenType == Token.Type.String);
167 	assert("==".getTokenType == Token.Type.Operator);
168 	assert(";".getTokenType == Token.Type.StatementEnd);
169 	assert(",".getTokenType == Token.Type.Comma);
170 	assert("int".getTokenType == Token.Type.DataType);
171 	assert("double".getTokenType == Token.Type.DataType);
172 	assert("char".getTokenType == Token.Type.DataType);
173 	assert("function".getTokenType == Token.Type.Keyword);
174 	assert("if".getTokenType == Token.Type.Keyword);
175 	assert("while".getTokenType == Token.Type.Keyword);
176 	assert("else".getTokenType == Token.Type.Keyword);
177 	assert(".".getTokenType == Token.Type.MemberSelector);
178 	assert("\'p\'".getTokenType == Token.Type.Char);
179 	assert("null".getTokenType == Token.Type.Keyword);
180 	assert("true".getTokenType == Token.Type.Bool);
181 	assert("false".getTokenType == Token.Type.Bool);
182 }
183 
184 /// returns Token[] with type identified based on string[] input
185 private Token[] stringToTokens(string[] s, bool detectType = true){
186 	Token[] r;
187 	r.length = s.length;
188 	foreach (i, token; s){
189 		if (detectType)
190 			r[i].type = getTokenType(s[i]);
191 		r[i].token = s[i].dup;
192 	}
193 	return r;
194 }
195 
196 /// Reads script, and separates tokens
197 private TokenList separateTokens(string[] script, LinkedList!CompileError compileErrors){
198 	static bool isDifferent(char c, ref char[] token){
199 		static const SEPERATORS = ['(','[','{','}',']',')', ';', ','];
200 		static const WHITESPACE = [' ', '\t'];
201 		static char[] lastToken = []; /// stores last complete token, used to check if `-` or `.` is to be considered operator or part of number
202 		static char pendingTokenChar = 0; /// as the name says...
203 		if (pendingTokenChar != 0){
204 			token = [pendingTokenChar];
205 			pendingTokenChar = 0;
206 			if (SEPERATORS.hasElement(token[0])){
207 				if (!WHITESPACE.hasElement(c))
208 					pendingTokenChar = c;
209 				lastToken = token.dup;
210 				return true;
211 			}
212 		}
213 		if (token.length && ['"', '\''].hasElement(token[0])){
214 			token = token ~ c;
215 			if (c == token[0] && token[$-2] != '\\'){
216 				lastToken = token.dup;
217 				return true;
218 			}
219 			return false;
220 		}
221 		if (WHITESPACE.hasElement(c)){
222 			if (token.length > 0){
223 				lastToken = token.dup;
224 				return true;
225 			}
226 			return false;
227 		}
228 		if (SEPERATORS.hasElement(c)){
229 			if (token.length == 0){
230 				token = [c];
231 				lastToken = token.dup;
232 				return true;
233 			}
234 			pendingTokenChar = c;
235 			lastToken = token.dup;
236 			return true;
237 		}
238 		if (token.length > 0){
239 			// strings
240 			if (token[0] == '"' || token[0] == '\''){
241 				token = token ~ c;
242 				if (c == token[0] && token[$-1] != '\\'){
243 					lastToken = token.dup;
244 					return true;
245 				}
246 				return false;
247 			}
248 			// unexpected strings get read as separate tokens
249 			if ((c == '"' || c == '\'') && token[0] != c){
250 				pendingTokenChar = c;
251 				lastToken = token.dup;
252 				return true;
253 			}
254 			// space
255 			if (c == ' ' || c == '\t'){
256 				lastToken = token.dup;
257 				return true;
258 			}
259 			// - is operator or part of number
260 			if (token[0] == '-' && isNum([c],false) && !(lastToken.matchElements(cast(char[])IDENT_CHARS))){
261 				token = token ~ c;
262 				// go on
263 				return false;
264 			}
265 			// . is memberSelector or decimal place
266 			if (c == '.' && !isNum(cast(string)token, false)){
267 				lastToken = token;
268 				pendingTokenChar = c;
269 				return true;
270 			}
271 			// token is operator
272 			if (OPERATORS.hasElement(cast(string)token) || SOPERATORS.hasElement(cast(string)token)){
273 				// see if it's still operator after adding c
274 				if (OPERATORS.hasElement(cast(string)(token ~ c)) || SOPERATORS.hasElement(cast(string)(token ~ c))){
275 					// go on
276 					token = token ~ c;
277 					return false;
278 				}else{
279 					pendingTokenChar = c;
280 					lastToken = token.dup;
281 					return true;
282 				}
283 			}else if ((OPERATORS.hasElement(cast(string)[c]) || SOPERATORS.hasElement(cast(string)[c])) && !isNum(cast(string)(token~c))){
284 				// token not operator, c is operator
285 				pendingTokenChar = c;
286 				lastToken = token.dup;
287 				return true;
288 			}
289 		}
290 		// nothing else matches, just add it to end
291 		token = token ~ c;
292 		return false;
293 	}
294 	LinkedList!string tokens = new LinkedList!string;
295 	uinteger[] tokenPerLine;
296 	tokenPerLine.length = script.length;
297 	uinteger tokenCount = 0;
298 	foreach (lineno, line; script){
299 		integer stringEndIndex = -1;
300 		char[] token = [];
301 		for (uinteger i = 0; i < line.length; i ++){
302 			// skip strings
303 			if ((line[i] == '"' || line[i] == '\'') && i > stringEndIndex){
304 				stringEndIndex = line.strEnd(i);
305 				if (stringEndIndex == -1){
306 					compileErrors.append(CompileError(lineno, "string not closed"));
307 					break;
308 				}
309 			}
310 			// break at comments
311 			if (line[i] == '#' && cast(integer)i > stringEndIndex){
312 				isDifferent(' ', token);
313 				// add pending token
314 				if (token.length){
315 					tokens.append(cast(string)token.dup);
316 					token = [];
317 				}
318 				break;
319 			}
320 			// hand this line[i] to isDifferent
321 			if (isDifferent(line[i], token)){
322 				tokens.append(cast(string)token.dup);
323 				token = [];
324 			}
325 		}
326 		isDifferent(' ', token);
327 		if (token.length)
328 			tokens.append(cast(string)token.dup);
329 		tokenPerLine[lineno] = tokens.count - tokenCount;
330 		tokenCount += tokenPerLine[lineno];
331 	}
332 	// put them all in TokenList
333 	TokenList r;
334 	r.tokenPerLine = tokenPerLine; // no need to dup it
335 	r.tokens = stringToTokens(tokens.toArray, false);
336 	.destroy (tokens);
337 	return r;
338 }
339 ///
340 unittest{
341 	string[] script = [
342 		"function void main{",
343 		"\tint i = 5;",
344 		"\t.5sdfdf = (!5 - 5);",
345 		"\ta.b.c = @a;",
346 		"\ta = 5.5;",
347 		" a = -20+5;",
348 		" a=-20+5;",
349 		" a == -b;",
350 		"a <= b;",
351 		"a > b",
352 		"\"string\twith white\\\"Space\"\"unexpected string\""
353 	];
354 	LinkedList!CompileError err = new LinkedList!CompileError;
355 	Token[] tokens = separateTokens(script, err).tokens;
356 	.destroy (err);
357 	string[] strTokens;
358 	strTokens.length = tokens.length;
359 	foreach (i, tok; tokens){
360 		strTokens[i] = tok.token;
361 	}
362 	/*import std.stdio : writeln;
363 	foreach(token; strTokens)
364 		writeln(token);*/
365 	assert (strTokens == [
366 			"function", "void", "main", "{",
367 			"int", "i", "=", "5", ";",
368 			".", "5sdfdf", "=", "(", "!", "5", "-", "5", ")", ";",
369 			"a", ".", "b", ".", "c", "=", "@", "a", ";",
370 			"a", "=", "5.5", ";",
371 			"a", "=", "-20", "+", "5", ";",
372 			"a", "=", "-20", "+", "5", ";",
373 			"a", "==", "-", "b", ";",
374 			"a", "<=", "b", ";",
375 			"a", ">", "b",
376 			"\"string\twith white\\\"Space\"", "\"unexpected string\""
377 		]);
378 }
379 
380 /// Returns the index of the quotation mark that ends a string
381 /// 
382 /// Returns -1 if not found
383 private integer strEnd(string s, uinteger i){
384 	const char end = s[i] == '\'' ? '\'' : '\"';
385 	for (i++;i<s.length;i++){
386 		if (s[i]=='\\'){
387 			i++;
388 			continue;
389 		}else if (s[i]==end){
390 			break;
391 		}
392 	}
393 	if (i==s.length){i=-1;}
394 	return i;
395 }
396 
397 private alias decodeString = strReplaceSpecial;
398 
399 /// decodes a string. i.e, converts \t to tab, \" to ", etc
400 /// The string must not be surrounded by quoation marks
401 /// 
402 /// Returns: string with special characters replaced with their actual characters (i.e, \t replaced with tab, \n with newline...)
403 private string strReplaceSpecial(char specialCharBegin='\\', char[char] map = ['t' : '\t', 'n' : '\n','\\':'\\'])
404 (string s){
405 	char[] r = [];
406 	for (uinteger i = 0; i < s.length; i ++){
407 		if (s[i] == specialCharBegin && i + 1 < s.length && s[i+1] in map){
408 			r ~= map[s[i+1]];
409 			i++;
410 			continue;
411 		}
412 		r ~= s[i];
413 	}
414 	return r;
415 }
416 
417 /// Takes script, and separates into tokens (using `separateTokens`), identifies token types, retuns the Tokens with Token.Type
418 /// in an array
419 /// 
420 /// `script` is the script to convert to tokens, each line is a separate string, without ending \n
421 /// `errors` is the array to which erors will be put
422 /// 
423 /// As a plus, it also checks if the brackets are in correct order (and properly closed)
424 package TokenList toTokens(string[] script, ref CompileError[] errors){
425 	LinkedList!CompileError compileErrors = new LinkedList!CompileError;
426 	/// Returns true if a string has chars that only identifiers can have
427 	TokenList tokens = separateTokens(script, compileErrors);
428 	if (tokens.tokens == null || tokens.tokens.length == 0){
429 		// there's error
430 		errors = compileErrors.toArray;
431 		.destroy(compileErrors);
432 		return tokens;
433 	}else{
434 		// continue with identiying tokens
435 		// fill in tokens with tokenStrings' strings, and identify their type
436 		foreach(i, token; tokens.tokens){
437 			try{
438 				tokens.tokens[i].type = getTokenType(token.token);
439 			}catch(Exception e){
440 				compileErrors.append(CompileError(tokens.getTokenLine(i), e.msg));
441 			}
442 		}
443 		// now check brackets
444 		tokens.checkBrackets(compileErrors);
445 		if (compileErrors.count > 0){
446 			errors = compileErrors.toArray;
447 		}
448 		.destroy(compileErrors);
449 		return tokens;
450 	}
451 }
452 
453 
454 /// Checks if the brackets in a tokenlist are in correct order, and are closed
455 /// 
456 /// In case not, returns false, and appends error to `errorLog`
457 private bool checkBrackets(TokenList tokens, LinkedList!CompileError errors){
458 	enum BracketType{
459 		Round,
460 		Square,
461 		Block
462 	}
463 	BracketType[Token.Type] brackOpenIdent = [
464 		Token.Type.ParanthesesOpen: BracketType.Round,
465 		Token.Type.IndexBracketOpen: BracketType.Square,
466 		Token.Type.BlockStart: BracketType.Block
467 	];
468 	BracketType[Token.Type] brackCloseIdent = [
469 		Token.Type.ParanthesesClose: BracketType.Round,
470 		Token.Type.IndexBracketClose: BracketType.Square,
471 		Token.Type.BlockEnd:BracketType.Block
472 	];
473 	Stack!BracketType bracks = new Stack!BracketType;
474 	Stack!uinteger bracksStartIndex = new Stack!uinteger;
475 	BracketType curType;
476 	bool r = true;
477 	for (uinteger lastInd = tokens.tokens.length-1, i = 0; i<=lastInd; i++){
478 		if (tokens.tokens[i].type in brackOpenIdent){
479 			bracks.push(brackOpenIdent[tokens.tokens[i].type]);
480 			bracksStartIndex.push(i);
481 		}else if (tokens.tokens[i].type in brackCloseIdent){
482 			bracksStartIndex.pop();
483 			if (bracks.pop != brackCloseIdent[tokens.tokens[i].type]){
484 				errors.append(CompileError(tokens.getTokenLine(i),
485 						"brackets order is wrong; first opened must be last closed"));
486 				r = false;
487 				break;
488 			}
489 		}else if (i == lastInd && bracks.count > 0){
490 			// no bracket ending
491 			i = -2;
492 			errors.append(CompileError(tokens.getTokenLine(bracksStartIndex.pop), "bracket not closed"));
493 			r = false;
494 			break;
495 		}
496 	}
497 
498 	.destroy(bracks);
499 	.destroy(bracksStartIndex);
500 	return r;
501 }
502 
503 /// Returns index of closing/openinig bracket of the provided bracket  
504 /// 
505 /// `forward` if true, then the search is in forward direction, i.e, the closing bracket is searched for
506 /// `tokens` is the array of tokens to search in
507 /// `index` is the index of the opposite bracket
508 /// 
509 /// It only works correctly if the brackets are in correct order, and the closing bracket is present  
510 /// so, before calling this, `compiler.misc.checkBrackets` should be called
511 package uinteger tokenBracketPos(bool forward=true)(Token[] tokens, uinteger index){
512 	Token.Type[] closingBrackets = [
513 		Token.Type.BlockEnd,
514 		Token.Type.IndexBracketClose,
515 		Token.Type.ParanthesesClose
516 	];
517 	Token.Type[] openingBrackets = [
518 		Token.Type.BlockStart,
519 		Token.Type.IndexBracketOpen,
520 		Token.Type.ParanthesesOpen
521 	];
522 	uinteger count; // stores how many closing/opening brackets before we reach the desired one
523 	uinteger i = index;
524 	for (uinteger lastInd = (forward ? tokens.length : 0); i != lastInd; (forward ? i ++: i --)){
525 		if ((forward ? openingBrackets : closingBrackets).hasElement(tokens[i].type)){
526 			count ++;
527 		}else if ((forward ? closingBrackets : openingBrackets).hasElement(tokens[i].type)){
528 			count --;
529 		}
530 		if (count == 0){
531 			break;
532 		}
533 	}
534 	return i;
535 }
536 ///
537 unittest{
538 	Token[] tokens;
539 	tokens = [
540 		Token(Token.Type.Comma, ","), Token(Token.Type.BlockStart,"{"), Token(Token.Type.Comma,","),
541 		Token(Token.Type.IndexBracketOpen,"["),Token(Token.Type.IndexBracketClose,"]"),Token(Token.Type.BlockEnd,"}")
542 	];
543 	assert(tokens.tokenBracketPos!true(1) == 5);
544 	assert(tokens.tokenBracketPos!false(5) == 1);
545 	assert(tokens.tokenBracketPos!true(3) == 4);
546 	assert(tokens.tokenBracketPos!false(4) == 3);
547 }