This code generates a random 16-character string using only A,C,T,G. It then checks whether this sequence is in the hash (unordered_map), and if not, inserts it and points to a dummy placeholder.
In its current form, it hangs at datact=16384 when the ‘for i loop’ requires 20000 iterations, despite the fact that there are 4^16 strings with ACTG.
But.. if the string length is changed to 8, 9, 10, 11.. to 15, or 17, 18.. it correctly iterates to 20000. Why does unordered_map refuse to hash new sequences, but only when those sequences are 16 characters long?
#include <string>
#include <vector>
#include <unordered_map>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <iostream>
using namespace std;
int main(int argc, char* argv[])
{
string funnelstring;
srand ( time(NULL) );
const int buffersize=10000;
int currentsize=buffersize;
int datact=0;
vector <unsigned int> ctarr(buffersize);
vector <char> nuc(4);
nuc[0]='A';
nuc[1]='C';
nuc[2]='T';
nuc[3]='G';
unordered_map <string,unsigned int*> location;
unsigned int sct;
sct=1;
for (int i=0;i<20000; i++)
{
do
{
funnelstring="";
for (int i=0; i<16; i++)
{ // generate random 16 nucleotide sequence
funnelstring+=nuc[(rand() % 4)];
}
} while (location.find(funnelstring) != location.end()); //asks whether this key has been assigned
ctarr[datact]=sct;
location[funnelstring]=&ctarr[datact]; //assign current key to point to data count
datact++;
cout << datact << endl;
if (datact>=currentsize)
{
ctarr.resize(currentsize+buffersize);
currentsize+=buffersize;
}
}
return 0;
}
As @us2012 said, the problem is your PRNG, and the poor randomness in the lower order bits. Here’s a relevant quote:
Also, as others have pointed out, you can also use a better, more modern RNG.