How can you use stringstream to tokenize a line that looks like this?
[label] opcode [arg1] [,arg2]
The label may not always be there but if it isn’t, there will be a white space. The opcode is always there and there is a space or tab in between opcode and arg1. Then there is no whitespace in between arg1 and arg2 but it is split by a comma.
Also, some blank lines will have white space on them so they need to be discarded.
‘#’ is a comment
So for instance:
#Sample Input
TOP NoP
L 2,1
VAL INT 0
This is just an example of the text file I’ll be reading in from. So for line one, the label would be TOP and the opcode would be NOP, with no arguments being passed.
I’ve been working on it but I need a simpler way to tokenize and from what I’ve seen, stringstream seems to be the one I’d like to use so if anyone can tell me sort of how to do this, I’d really appreciate it.
I’ve been racking my brain on how to do this and just to show you that I’m not just asking without working, here is my current code:
// ---- Pass 1: size every line and record where each label lands -------------
// Line format: [label] opcode [arg1][,arg2]
//   * a line that starts with a blank (space/tab) has no label
//   * '#' in the first column marks a comment line
//   * empty and whitespace-only lines are ignored
int counter = 0; // address the next instruction/data word will occupy
int i = 0;       // next free slot in Symtablelab / Symtablepos
int j = 0;       // not referenced by either pass visible here; declaration kept as-is
int p = 0;       // write cursor into memory[], used by the second pass
while (getline(myFile, line, '\n'))
{
    const string blanks = " \t";

    // Skip empty and whitespace-only lines. They must not advance `counter`:
    // the second pass emits nothing for them, so counting them here would
    // shift every label address that follows.
    string::size_type pos = line.find_first_not_of(blanks);
    if (pos == string::npos)
    {
        continue;
    }
    // Comment line ('#' in the first column only).
    if (line[0] == '#')
    {
        continue;
    }

    // Start each line from a clean slate so a field missing on this line
    // never silently reuses the value parsed from a previous line.
    label = "";
    opcode = "";
    arg1 = "";
    arg2 = "";

    // [label]: present only when the very first character is not a blank.
    if (pos == 0)
    {
        const string::size_type labelEnd = line.find_first_of(blanks);
        label = line.substr(0, labelEnd); // npos => rest of the line
        Symtablelab[i] = label;
        Symtablepos[i] = counter;
        i++;
        // Move to the opcode; yields npos when the line holds only a label.
        pos = line.find_first_not_of(blanks, labelEnd);
    }

    // opcode [arg1][,arg2]
    if (pos != string::npos)
    {
        // When no delimiter is found the end is npos; substr clamps the
        // (wrapped) length to the end of the line, which is what we want.
        const string::size_type opEnd = line.find_first_of(blanks, pos);
        opcode = line.substr(pos, opEnd - pos);

        const string::size_type argStart = line.find_first_not_of(blanks, opEnd);
        if (argStart != string::npos)
        {
            // arg1 ends at the comma separating it from arg2, or at a blank.
            const string::size_type arg1End = line.find_first_of(", \t", argStart);
            arg1 = line.substr(argStart, arg1End - argStart);
            if (arg1End != string::npos)
            {
                // arg2 is whatever follows the delimiter, up to the next blank.
                // It does not affect sizing; it is parsed so the shared
                // variable reflects the current line.
                const string::size_type arg2Start = arg1End + 1;
                const string::size_type arg2End = line.find_first_of(blanks, arg2Start);
                arg2 = line.substr(arg2Start, arg2End - arg2Start);
            }
        }
    }

    // Size of the line in memory words:
    //   WORDS n -> n words, INT -> 1 word, any other opcode -> 3 words.
    // A line with no opcode (label only) occupies nothing.
    if (opcode == "WORDS")
    {
        counter += atoi(arg1.c_str());
    }
    else if (opcode == "INT")
    {
        counter++;
    }
    else if (!opcode.empty())
    {
        counter += 3;
    }
}
// ---- Pass 2: re-read the file and emit every line into memory[] ------------
// getline() left the stream in the EOF/fail state at the end of the first
// pass, so clear the flags before seeking back to the start.
myFile.clear();
myFile.seekg(0, ios::beg);
while (getline(myFile, line))
{
    const string blanks = " \t";

    // Skip empty and whitespace-only lines.
    string::size_type pos = line.find_first_not_of(blanks);
    if (pos == string::npos)
    {
        continue;
    }
    // Comment line ('#' in the first column only).
    if (line[0] == '#')
    {
        continue;
    }

    // Reset the per-line fields so a missing opcode/argument is stored as an
    // empty string instead of whatever the previous line happened to leave.
    label = "";
    opcode = "";
    arg1 = "";
    arg2 = "";

    // [label]: present only when the very first character is not a blank.
    if (pos == 0)
    {
        const string::size_type labelEnd = line.find_first_of(blanks);
        label = line.substr(0, labelEnd); // npos => rest of the line
        // Move to the opcode; yields npos when the line holds only a label.
        pos = line.find_first_not_of(blanks, labelEnd);
    }

    // opcode [arg1][,arg2]
    if (pos != string::npos)
    {
        // When no delimiter is found the end is npos; substr clamps the
        // (wrapped) length to the end of the line, which is what we want.
        const string::size_type opEnd = line.find_first_of(blanks, pos);
        opcode = line.substr(pos, opEnd - pos);

        const string::size_type argStart = line.find_first_not_of(blanks, opEnd);
        if (argStart != string::npos)
        {
            // arg1 ends at the comma separating it from arg2, or at a blank.
            const string::size_type arg1End = line.find_first_of(", \t", argStart);
            arg1 = line.substr(argStart, arg1End - argStart);
            if (arg1End != string::npos)
            {
                // arg2 is whatever follows the delimiter, up to the next blank.
                const string::size_type arg2Start = arg1End + 1;
                const string::size_type arg2End = line.find_first_of(blanks, arg2Start);
                arg2 = line.substr(arg2Start, arg2End - arg2Start);
            }
        }
    }

    // A line that carries only a label emits nothing.
    if (opcode.empty())
    {
        continue;
    }

    if (opcode == "INT")
    {
        // Data word: store the literal value, one cell.
        memory[p] = arg1;
        p++;
    }
    else if (opcode == "HALT" || opcode == "NOP" || opcode == "P_REGS")
    {
        // No operands, but the instruction still occupies three cells.
        memory[p] = opcode;
        p += 3;
    }
    else if (opcode == "J" || opcode == "JEQR" || opcode == "JNE" || opcode == "JNER" ||
             opcode == "JLT" || opcode == "JLTR" || opcode == "JGT" || opcode == "JGTR" ||
             opcode == "JLE" || opcode == "JLER" || opcode == "JGE" || opcode == "JGER" ||
             opcode == "JR")
    {
        // Opcode plus one operand; the third cell is left untouched.
        // NOTE(review): "JEQ" is not in this list although "JEQR" is -- confirm
        // whether that is intentional.
        memory[p] = opcode;
        memory[p + 1] = arg1;
        p += 3;
    }
    else if (opcode == "WORDS")
    {
        // Reserve exactly l zeroed cells. The bound is k < l (not k <= l) so
        // the number of cells written equals the amount p advances by, which
        // is also what the first pass adds to its counter for WORDS.
        const int l = atoi(arg1.c_str());
        for (int k = 0; k < l; k++)
        {
            memory[p + k] = "0";
        }
        p += l;
    }
    else
    {
        // Every other opcode: opcode followed by two operand cells.
        memory[p] = opcode;
        memory[p + 1] = arg1;
        memory[p + 2] = arg2;
        p += 3;
    }
}
I would obviously like to make this much much better so any help would be greatly appreciated.
Before you go mad with maintaining those huge `if` statements or trying to learn Boost Spirit, let’s try to write a very simple parser. This is a bit of a long post, and it doesn’t get directly to the point, so please bear with me.
First, we need a grammar, which seems to be dead simple:
In english: A line of code consists of an optional label, an opcode and an optional argument list. Arguments list is either a single argument (an integer) or an argument followed by a separator (comma) and another argument list.
Let’s first define two data structures. Labels are supposed to be unique (right?), so we’ll have a set of strings so we can easily look them up at any time and possibly report an error if we find a duplicate label. The next one is a map of strings to `size_t`, which acts as a symbol table of valid opcodes together with the expected number of arguments for each opcode.
I don’t know what exactly `memory` is in your code, but your way of calculating offsets to figure out where to put arguments seems unnecessarily complicated. Let’s define a data structure that can elegantly hold a line of code instead. I’d do something like this:
A syntax error is kind of an exceptional circumstance that’s not easily recoverable, so let’s deal with them by throwing exceptions. Our simple exception class can look like this:
Tokenizing, lexing and parsing are usually separate tasks. But I guess for this simple example, we can combine the tokenizer and lexer in one class. We already know the elements our grammar is made of, so let’s write a class that’ll take input as text and extract grammar elements from it. The interface could look like this:
And the workhorse, a function that tries to make sense of the tokens and returns a `code_line` struct if everything goes fine:
And here’s how we might use all this:
If the input was successfully parsed, we now have a vector of valid lines with all the info (labels, number of arguments) and can do pretty much anything we like with it. This code will be much easier to maintain and extend than yours, IMO. If you need to introduce a new opcode, for example, just make another entry in the map (`symbol_table`). How’s that compared to your `if` statements? 🙂
The only thing left is the actual implementation of the `token_stream`’s methods. Here’s how I did it for `get_label`:
And that’s it. I leave the rest of the implementation as an exercise for you. Hope it helped. 🙂