I have implemented the Naive Bayes algorithm on a large data set of 410k rows. All my records are now classified correctly, but the program takes almost an hour to write the records into the corresponding files. What is the best way to improve the performance of my code? The code below is the piece that writes the 410k records into the corresponding files. Thank you.
fp=fopen("sales_ok_fraud.txt","r");
while(fgets(line,80,fp)!=NULL) //Reading each line from file to calculate the file size.
{
token = strtok(line,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token = strtok(NULL,",");
token1 = strtok(token,"\n");
memcpy(mystr,&token1[0],strlen(token1)-1);
mystr[strlen(token1)-1] = '\0';
if( strcmp(mystr,"ok") == 0 )
counter_ok++;
else
counter_fraud++;
}
printf("The no. of records with OK label are %f\n",counter_ok);
printf("The no. of records with FRAUD label are %f\n",counter_fraud);
prblty_ok = counter_ok/(counter_ok+counter_fraud);
prblty_fraud = counter_fraud/(counter_ok+counter_fraud);
printf("The probability of OK records is %f\n",prblty_ok);
printf("The probability of FRAUD records is %f\n",prblty_fraud);
fclose(fp);
fp=fopen("sales_unknwn.txt","r");
fp2=fopen("sales_unknown_ok_classified.txt","a");
fp3=fopen("sales_unknown_fraud_classified.txt","a");
while(fgets(line1,80,fp)!=NULL) //Reading each line from file to calculate the file size.
{
unknwn_attr1 = strtok(line1,",");
unknwn_attr2 = strtok(NULL,",");
unknwn_attr3 = strtok(NULL,",");
unknwn_attr4 = strtok(NULL,",");
unknwn_attr5 = strtok(NULL,",");
//printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
fp1=fopen("sales_ok_fraud.txt","r");
while(fgets(line,80,fp1)!=NULL) //Reading each line from file to calculate the file size.
{
ok_fraud_attr1 = strtok(line,",");
ok_fraud_attr2 = strtok(NULL,",");
ok_fraud_attr3 = strtok(NULL,",");
ok_fraud_attr4 = strtok(NULL,",");
ok_fraud_attr5 = strtok(NULL,",");
ok_fraud_attr6 = strtok(NULL,",");
memcpy(ok_fraud_attr6_str,&ok_fraud_attr6[0],strlen(ok_fraud_attr6)-2);
ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';
//ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';
//printf("Testing ok_fraud_attr6 - %s-%d\n",ok_fraud_attr6_str,strlen(ok_fraud_attr6_str));
if( strcmp(ok_fraud_attr6_str,"ok") == 0 )
{
if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
counter_ok_attr2++;
if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_ok_attr3++;
if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
counter_ok_attr4++;
if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
counter_ok_attr5++;
}
if( strcmp(ok_fraud_attr6_str,"fraud") == 0 )
{
if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
counter_fraud_attr2++;
if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
counter_fraud_attr3++;
if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
counter_fraud_attr4++;
if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
counter_fraud_attr5++;
}
}
fclose(fp1);
if(counter_ok_attr2 == 0)
prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);
if(counter_ok_attr3 == 0)
prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);
if(counter_ok_attr4 == 0)
prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);
if(counter_ok_attr5 == 0)
prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
else
prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);
if(counter_fraud_attr2 == 0)
prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);
if(counter_fraud_attr3 == 0)
prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);
if(counter_fraud_attr4 == 0)
prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);
if(counter_fraud_attr5 == 0)
prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
else
prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);
total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;
// printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
// printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
// printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
// printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
// printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);
if(total_prblty_ok > total_prblty_fraud)
{
fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
else
{
fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
}
counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
}
fclose(fp);
fclose(fp2);
fclose(fp3);
There are several things I can see right away that you can do, in the order I would try them:
The strlen() rampage needs to be significantly reduced. Most decent optimizing compilers will detect the unchanged source and optimize out the subsequent calls on a known-unchanged char pointer, so I would do this last (but honestly I'd still do it, as it's bad practice to make repeated strlen() calls on the same data).
Logic Reductions
You can cut out a ton of work in one place in particular, changing this:
To this:
Front-Loading sales_ok_fraud.txt
The following relies on the sanctity of the data format of your sales_ok_fraud.txt stats file, while trying to be as pedantic as possible in validating said format. It allocates a chunk of memory large enough to hold the entire file plus one char, so that the entire body can be treated as a single null-terminated string. That buffer is then pieced up via the same general algorithm you had before. The result is a table of pointers to fixed-length char-pointer arrays that can then be used iteratively in the same place where you currently (and repeatedly) open, parse, use, and throw away all that content.
Putting It Together
Incorporating everything above has the following effect on your code base:
These are just a few ideas. There are more things in there, to be sure, but these should help enormously in processing your file in a single forward scan with continuous output, which is about as efficient as you're going to get under these circumstances. Without question, the combination of the big three (a single file open and close, the logic reductions, and parsing the sales_ok_fraud.txt file once and caching it) will bring a huge improvement in performance, especially the first and last of these.
EDIT: Assisted the OP in updating this processor to front-load the sales_ok_fraud.txt file content, thereby eliminating the repeated loading, parsing, and prompt discarding of some 15000+ lines of text (once per main-source input line). The answer above has been updated accordingly.