1 /++
2 For reading a script into tokens
3 +/
4 module qscript.compiler.tokengen;
5 
6 import utils.misc;
7 import utils.lists;
8 import std.conv:to;
9 import qscript.compiler.misc;
10 
/// Errors collected while generating tokens.
///
/// `toTokens` assigns a fresh list to this before it separates tokens, and destroys the list before it returns.
/// `separateTokens` and `toTokens` append to it, and `toTokens` also passes it to `checkBrackets`.
private LinkedList!CompileError compileErrors;
13 
/// Attempts to identify a token type by the token (string).
///
/// The checks are done in a fixed order, and the first one that matches decides the type: member selector,
/// assignment operator, integer, double, data type, keyword, identifier, operator, string, char, and then the
/// single character punctuation tokens.
///
/// Returns: the type of the token
///
/// Throws: Exception if the token is empty, if it matches no type, or if it is a char literal which does not hold
/// exactly one character
private Token.Type getTokenType(string token){
	/// Returns true if a string is a keyword
	bool isKeyword(string s){
		return KEYWORDS.hasElement(s);
	}
	/// Returns true if a string is an identifier
	bool isIdentifier(string s){
		// a number or a keyword can be made of identifier characters only, so those are ruled out first.
		// This checks `s`, the string it was asked about, not the `token` of the enclosing function
		if (s.isNum || isKeyword(s))
			return false;
		return (cast(char[])s).matchElements(cast(char[])IDENT_CHARS);
	}
	/// Returns true is a string is an operator
	bool isOperator(string s){
		return OPERATORS.hasElement(s) || SOPERATORS.hasElement(s);
	}
	/// Returns true if string contains an integer
	bool isInt(string s){
		return isNum(s, false);
	}
	/// Returns true if a string contains a double
	/// 
	/// This is only asked after `isInt` said no, so a number accepted here is one that `isNum` accepts only
	/// when its second argument is true (presumably: when a decimal point is allowed - confirm against `isNum`)
	bool isDouble(string s){
		return isNum(s, true);
	}
	// an empty string is not a token. It is rejected here, so that `token[0]` below is always in range, and so
	// that it cannot pass as an identifier just because it has no characters that could fail the check
	if (token.length == 0)
		throw new Exception("unidentified token type '"~token~'\'');
	if (token == ".")
		return Token.Type.MemberSelector;
	if (token == "=")
		return Token.Type.AssignmentOperator;
	if (isInt(token))
		return Token.Type.Integer;
	if (isDouble(token))
		return Token.Type.Double;
	if (DATA_TYPES.hasElement(token))
		return Token.Type.DataType;
	if (isKeyword(token))
		return Token.Type.Keyword;
	if (isIdentifier(token))
		return Token.Type.Identifier;
	if (isOperator(token))
		return Token.Type.Operator;
	if (token[0] == '"')
		return Token.Type.String;
	if (token[0] == '\''){
		// shortest valid char literal is 3 characters: quote, character, quote
		if (token.length < 3)
			throw new Exception("no character provided inside ''");
		// an escape sequence is more than one character in the script, so the decoded length is checked
		if (decodeString(token[1 .. $-1]).length > 1)
			throw new Exception("'' can only hold 1 character");
		return Token.Type.Char;
	}
	if (token == ";")
		return Token.Type.StatementEnd;
	if (token == ",")
		return Token.Type.Comma;
	if (token == "(")
		return Token.Type.ParanthesesOpen;
	if (token == ")")
		return Token.Type.ParanthesesClose;
	if (token == "[")
		return Token.Type.IndexBracketOpen;
	if (token == "]")
		return Token.Type.IndexBracketClose;
	if (token == "{")
		return Token.Type.BlockStart;
	if (token == "}")
		return Token.Type.BlockEnd;
	throw new Exception("unidentified token type '"~token~'\'');
}
///
unittest{
	/// a token, and the type it has to be identified as
	static struct Case{
		string input;
		Token.Type expected;
	}
	Case[] cases = [
		Case("thisIsAVar_1234", Token.Type.Identifier),
		Case("24.5", Token.Type.Double),
		Case("245", Token.Type.Integer),
		Case("\"This is a string\"", Token.Type.String),
		Case("==", Token.Type.Operator),
		Case(";", Token.Type.StatementEnd),
		Case(",", Token.Type.Comma),
		Case("int", Token.Type.DataType),
		Case("double", Token.Type.DataType),
		Case("char", Token.Type.DataType),
		Case("function", Token.Type.Keyword),
		Case("if", Token.Type.Keyword),
		Case("while", Token.Type.Keyword),
		Case("else", Token.Type.Keyword),
		Case(".", Token.Type.MemberSelector),
		Case("'p'", Token.Type.Char),
	];
	foreach (testCase; cases){
		assert(getTokenType(testCase.input) == testCase.expected);
	}
}
107 
/// Converts token strings to Tokens, identifying the type of each one using `getTokenType`
///
/// Returns: Token[], with the same length and order as `s`, each one holding a copy of the string
///
/// Throws: Exception (from `getTokenType`) at the first string whose type cannot be identified
package Token[] stringToTokens(string[] s){
	Token[] identified;
	identified.length = s.length;
	foreach (index, ref current; identified){
		// type first: if it cannot be identified, this throws before anything else is done for this token
		current.type = getTokenType(s[index]);
		current.token = s[index].dup;
	}
	return identified;
}
118 
/// Reads script, and separates tokens
///
/// `script` is the script, each line is a separate string. Everything from a `#` to the end of the line is a
/// comment and is ignored, unless the `#` is inside a string or char literal.
///
/// String and char literals are located using `strEnd`, and are added as one token each, quotes included, so
/// whitespace, separators, operators and escaped quotes inside them do not split them. A literal that is not
/// closed on its line appends a "string not closed" error to `compileErrors` (which must have been created
/// before calling this), and the rest of that line is ignored.
///
/// Returns: TokenList with the tokens in the order they were read, and the number of tokens read from each line
/// in `tokenPerLine`. Only `Token.token` is set; the types are not identified here, so a token with an unknown
/// type does not throw from this function, `toTokens` identifies the types and reports those as errors
private TokenList separateTokens(string[] script){
	// state of `isDifferent`. These are local to one call, so nothing is carried over from one script to another
	char[] lastToken = []; /// stores last complete token, used to check if `-` is to be considered operator or part of number
	char pendingTokenChar = 0; /// the char which ended the last token, and has to begin the next one. 0 if none

	/// Decides if `c` ends the token that has been read so far.
	///
	/// `token` is the token read so far; `c` is appended to it if it is part of it. This never gets to see
	/// the characters of string or char literals, those are dealt with by the loop below.
	///
	/// Returns: true if `token` is complete and should be stored, and then emptied, by the caller
	bool isDifferent(char c, ref char[] token){
		static const SEPERATORS = ['(','[','{','}',']',')', ';', ','];
		static const WHITESPACE = [' ', '\t'];
		if (pendingTokenChar != 0){
			token = [pendingTokenChar];
			pendingTokenChar = 0;
			// a separator is always a token on its own, so `c` has to wait for the next call
			if (SEPERATORS.hasElement(token[0])){
				if (!WHITESPACE.hasElement(c))
					pendingTokenChar = c;
				lastToken = token.dup;
				return true;
			}
		}
		if (WHITESPACE.hasElement(c)){
			if (token.length > 0){
				lastToken = token.dup;
				return true;
			}
			return false;
		}
		if (SEPERATORS.hasElement(c)){
			if (token.length == 0){
				token = [c];
				lastToken = token.dup;
				return true;
			}
			pendingTokenChar = c;
			lastToken = token.dup;
			return true;
		}
		if (token.length > 0){
			// - is operator or part of number
			if (token[0] == '-' && isNum([c],false) && !(lastToken.matchElements(cast(char[])IDENT_CHARS))){
				token = token ~ c;
				// go on
				return false;
			}
			// . is memberSelector or decimal place
			if (c == '.' && !isNum(cast(string)token, false)){
				lastToken = token.dup;
				pendingTokenChar = c;
				return true;
			}
			// token is operator
			if (OPERATORS.hasElement(cast(string)token) || SOPERATORS.hasElement(cast(string)token)){
				// see if it's still operator after adding c
				if (OPERATORS.hasElement(cast(string)(token ~ c)) || SOPERATORS.hasElement(cast(string)(token ~ c))){
					// go on
					token = token ~ c;
					return false;
				}else{
					pendingTokenChar = c;
					lastToken = token.dup;
					return true;
				}
			}else if ((OPERATORS.hasElement(cast(string)[c]) || SOPERATORS.hasElement(cast(string)[c])) && !isNum(cast(string)(token~c))){
				// token not operator, c is operator
				pendingTokenChar = c;
				lastToken = token.dup;
				return true;
			}
		}
		// nothing else matches, just add it to end
		token = token ~ c;
		return false;
	}
	LinkedList!string tokens = new LinkedList!string;
	/// Stores whatever is still pending (the pending char, or the token read so far) in `tokens`.
	/// Handing a space to `isDifferent` does that: it ends any token, and is never itself kept
	void flush(ref char[] token){
		isDifferent(' ', token);
		if (token.length){
			tokens.append(cast(string)token.dup);
			token = [];
		}
	}
	uinteger[] tokenPerLine;
	tokenPerLine.length = script.length;
	uinteger tokenCount = 0;
	foreach (lineno, line; script){
		char[] token = [];
		for (uinteger i = 0; i < line.length; i ++){
			// string and char literals are read as a whole
			if (line[i] == '"' || line[i] == '\''){
				// index of the closing quote, -1 if there is none
				integer stringEndIndex = line.strEnd(i);
				if (stringEndIndex == -1){
					compileErrors.append(CompileError(lineno, "string not closed"));
					break;
				}
				// what was being read before the literal is a token of its own
				flush(token);
				string literal = line[i .. cast(uinteger)stringEndIndex + 1];
				tokens.append(literal);
				lastToken = literal.dup;
				// continue reading after the closing quote
				i = cast(uinteger)stringEndIndex;
				continue;
			}
			// break at comments, literals were skipped above, so this # cannot be inside one.
			// The flush after this loop stores the token that was being read
			if (line[i] == '#'){
				break;
			}
			// hand this line[i] to isDifferent
			if (isDifferent(line[i], token)){
				tokens.append(cast(string)token.dup);
				token = [];
			}
		}
		// a token never continues on the next line
		flush(token);
		tokenPerLine[lineno] = tokens.count - tokenCount;
		tokenCount += tokenPerLine[lineno];
	}
	// put them all in TokenList
	TokenList r;
	r.tokenPerLine = tokenPerLine; // no need to dup it
	string[] tokenStrings = tokens.toArray;
	r.tokens.length = tokenStrings.length;
	foreach (i, tokenString; tokenStrings){
		r.tokens[i].token = tokenString;
	}
	.destroy (tokens);
	return r;
}
///
unittest{
	// the lines cover: separators, a `.` before a number, member selectors, decimal numbers, `-` as a sign
	// and as an operator, operators of 2 characters, and a last line without a `;`
	string[] lines = [
		"function void main{",
		"\tint i = 5;",
		"\t.5sdfdf = (!5 - 5);",
		"\ta.b.c = @a;",
		"\ta = 5.5;",
		" a = -20+5;",
		" a=-20+5;",
		" a == -b;",
		"a <= b;",
		"a > b",
	];
	// one row for each of the lines above
	string[] expected = [
		"function", "void", "main", "{",
		"int", "i", "=", "5", ";",
		".", "5sdfdf", "=", "(", "!", "5", "-", "5", ")", ";",
		"a", ".", "b", ".", "c", "=", "@", "a", ";",
		"a", "=", "5.5", ";",
		"a", "=", "-20", "+", "5", ";",
		"a", "=", "-20", "+", "5", ";",
		"a", "==", "-", "b", ";",
		"a", "<=", "b", ";",
		"a", ">", "b"
	];
	string[] actual;
	foreach (separated; separateTokens(lines).tokens){
		actual ~= separated.token;
	}
	assert (actual == expected);
}
/// Takes script, separates it into tokens (using `separateTokens`), identifies the type of each token, and returns
/// the tokens in a TokenList
/// 
/// `script` is the script to convert to tokens, each line is a separate string, without ending \n
/// `errors` is the array to which errors will be put. If there were tokens, and no errors, it is left as it was
/// 
/// As a plus, it also checks the brackets, using `checkBrackets`
package TokenList toTokens(string[] script, ref CompileError[] errors){
	compileErrors = new LinkedList!CompileError;
	TokenList result = separateTokens(script);
	// no tokens at all: nothing to identify or check, just hand over whatever errors were found
	if (result.tokens == null || result.tokens.length == 0){
		errors = compileErrors.toArray;
		.destroy(compileErrors);
		return result;
	}
	// identify the types. A token which cannot be identified becomes an error on its line, and the rest of the
	// tokens are still identified
	foreach (index, ref current; result.tokens){
		try{
			current.type = getTokenType(current.token);
		}catch (Exception e){
			compileErrors.append(CompileError(result.getTokenLine(index), e.msg));
		}
	}
	// now check brackets
	result.checkBrackets(compileErrors);
	if (compileErrors.count > 0)
		errors = compileErrors.toArray;
	.destroy(compileErrors);
	return result;
}