I want to find a programmatic solution using C++.
I have a 900 files each of 27MB size. (just to inform about the enormity ).
Each file has 55K rows and Varying columns. But the header indicates the columns
I want to sort the rows in an order w.r.t to a Column Value.
I wrote the sorting algorithm for this (definitely my newbie attempts, you may say).
This algorithm is working for few numbers, but fails for larger numbers.
Here is the code for the same:
basic functions I defined to use inside the main code:
int getNumberOfColumns(const string& aline)
{
int ncols=0;
istringstream ss(aline);
string s1;
while(ss>>s1) ncols++;
return ncols;
}
vector<string> getWordsFromSentence(const string& aline)
{
vector<string>words;
istringstream ss(aline);
string tstr;
while(ss>>tstr) words.push_back(tstr);
return words;
}
bool findColumnName(vector<string> vs, const string& colName)
{
vector<string>::iterator it = find(vs.begin(), vs.end(), colName);
if ( it != vs.end())
return true;
else return false;
}
int getIndexForColumnName(vector<string> vs, const string& colName)
{
if ( !findColumnName(vs,colName) ) return -1;
else {
vector<string>::iterator it = find(vs.begin(), vs.end(), colName);
return it - vs.begin();
}
}
////////// I like the Recurssive functions - I tried to create a recursive function
///here. This worked for small values , say 20 rows. But for 55K - core dumps
void sort2D(vector<string>vn, vector<string> &srt, int columnIndex)
{
vector<double> pVals;
for ( int i = 0; i < vn.size(); i++) {
vector<string>meancols = getWordsFromSentence(vn[i]);
pVals.push_back(stringToDouble(meancols[columnIndex]));
}
srt.push_back(vn[max_element(pVals.begin(), pVals.end())-pVals.begin()]);
if (vn.size() > 1 ) {
vn.erase(vn.begin()+(max_element(pVals.begin(), pVals.end())-pVals.begin()) );
vector<string> vn2 = vn;
//cout<<srt[srt.size() -1 ]<<endl;
sort2D(vn2 , srt, columnIndex);
}
}
Now the main code:
for ( int i = 0; i < TissueNames.size() -1; i++)
{
for ( int j = i+1; j < TissueNames.size(); j++)
{
//string fname = path+"/gse7307_Female_rma"+TissueNames[i]+"_"+TissueNames[j]+".txt";
//string fname2 = sortpath2+"/gse7307_Female_rma"+TissueNames[i]+"_"+TissueNames[j]+"Sorted.txt";
string fname = path+"/gse7307_Male_rma"+TissueNames[i]+"_"+TissueNames[j]+".txt";
string fname2 = sortpath2+"/gse7307_Male_rma"+TissueNames[i]+"_"+TissueNames[j]+"4Columns.txt";
vector<string>AllLinesInFile;
BioInputStream fin(fname);
string aline;
getline(fin,aline);
replace (aline.begin(), aline.end(), '"',' ');
string headerline = aline;
vector<string> header = getWordsFromSentence(aline);
int pindex = getIndexForColumnName(header,"p-raw");
int xcindex = getIndexForColumnName(header,"xC");
int xeindex = getIndexForColumnName(header,"xE");
int prbindex = getIndexForColumnName(header,"X");
string newheaderline = "X\txC\txE\tp-raw";
BioOutputStream fsrt(fname2);
fsrt<<newheaderline<<endl;
int newpindex=3;
while ( getline(fin, aline) ){
replace (aline.begin(), aline.end(), '"',' ');
istringstream ss2(aline);
string tstr;
ss2>>tstr;
tstr = ss2.str().substr(tstr.length()+1);
vector<string> words = getWordsFromSentence(tstr);
string values = words[prbindex]+"\t"+words[xcindex]+"\t"+words[xeindex]+"\t"+words[pindex];
AllLinesInFile.push_back(values);
}
vector<string>SortedLines;
sort2D(AllLinesInFile, SortedLines,newpindex);
for ( int si = 0; si < SortedLines.size(); si++)
fsrt<<SortedLines[si]<<endl;
cout<<"["<<i<<","<<j<<"] = "<<SortedLines.size()<<endl;
}
}
can some one suggest me a better way of doing this?
why it is failing for larger values. ?
The primary function of interest for this query is Sort2D function.
thanks for the time and patience.
prasad.
I’m not sure why your code is crashing, but recursion in that case is only going to make the code less readable. I doubt it’s a stack overflow, however, because you’re not using much stack space in each call.
C++ already has
std::sort, why not use that instead? You could do it like this:To then sort your lines you would call
Now, there is one problem. This will be slow because
stringToDoubleandgetWordsFromSentenceare called multiple times on the same string. You would probably want to generate a separate vector which has precalculated the values of each string, and then haveCompareByStringValuejust use that vector as a lookup table.Another way you can do this is insert the strings into a
std::multimap<double, std::string>. Just insert the entries as(value, str)and then read them out line-by-line. This is simpler but slower (though has the same big-O complexity).EDIT: Cleaned up some incorrect code and derived from
binary_function.