I have implemented Naive Bayes algorithm on a large data set of 410k rows.Now

Question

0

Asked: June 14, 20262026-06-14T14:25:25+00:00 2026-06-14T14:25:25+00:00

I have implemented Naive Bayes algorithm on a large data set of 410k rows.Now

0

I have implemented Naive Bayes algorithm on a large data set of 410k rows.Now all my records are getting classified correctly but the thing is the program is taking almost an hr to write the records into the corresponding files.What is the best way to improve performance of my code.Here is the below code.This piece of code is writing the 410k records into the corresponding files.Thank you.

    fp=fopen("sales_ok_fraud.txt","r");
        while(fgets(line,80,fp)!=NULL) //Reading each line from file to calculate the file size.
        {
                token = strtok(line,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                token = strtok(NULL,",");
                token1 = strtok(token,"\n");
                memcpy(mystr,&token1[0],strlen(token1)-1);
                mystr[strlen(token1)-1] = '\0';

        if( strcmp(mystr,"ok") == 0 )
            counter_ok++;
        else 
        counter_fraud++;       
    }
    printf("The no. of records with OK label are %f\n",counter_ok);
    printf("The no. of records with FRAUD label are %f\n",counter_fraud);

    prblty_ok = counter_ok/(counter_ok+counter_fraud);
    prblty_fraud = counter_fraud/(counter_ok+counter_fraud);
    printf("The probability of OK records is %f\n",prblty_ok);
    printf("The probability of FRAUD records is %f\n",prblty_fraud);
    fclose(fp);

    fp=fopen("sales_unknwn.txt","r");
    fp2=fopen("sales_unknown_ok_classified.txt","a");
    fp3=fopen("sales_unknown_fraud_classified.txt","a");
    while(fgets(line1,80,fp)!=NULL) //Reading each line from file to calculate the file size.
        {
                unknwn_attr1 = strtok(line1,",");
                unknwn_attr2 = strtok(NULL,",");
                unknwn_attr3 = strtok(NULL,",");
                unknwn_attr4 = strtok(NULL,",");
                unknwn_attr5 = strtok(NULL,",");

        //printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);

        fp1=fopen("sales_ok_fraud.txt","r");
        while(fgets(line,80,fp1)!=NULL) //Reading each line from file to calculate the file size.
            {
            ok_fraud_attr1 = strtok(line,",");
                    ok_fraud_attr2 = strtok(NULL,",");
                    ok_fraud_attr3 = strtok(NULL,",");
                    ok_fraud_attr4 = strtok(NULL,",");
                    ok_fraud_attr5 = strtok(NULL,",");
            ok_fraud_attr6 = strtok(NULL,",");
                    memcpy(ok_fraud_attr6_str,&ok_fraud_attr6[0],strlen(ok_fraud_attr6)-2);
                    ok_fraud_attr6_str[strlen(ok_fraud_attr6)-2] = '\0';
            //ok_fraud_attr6[strlen(ok_fraud_attr6)-2] = '\0';      
            //printf("Testing ok_fraud_attr6 - %s-%d\n",ok_fraud_attr6_str,strlen(ok_fraud_attr6_str)); 
            if( strcmp(ok_fraud_attr6_str,"ok") == 0 )
            {
                if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
                counter_ok_attr2++;

                if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
                counter_ok_attr3++;

                if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
                counter_ok_attr4++;

                if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
                counter_ok_attr5++;
            }

                    if( strcmp(ok_fraud_attr6_str,"fraud") == 0 )
                        {
                if( strcmp(unknwn_attr2,ok_fraud_attr2) == 0 )
                counter_fraud_attr2++;

                if( strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
                counter_fraud_attr3++;

                if( strcmp(unknwn_attr4,ok_fraud_attr4) == 0 )
                counter_fraud_attr4++;

                if( strcmp(unknwn_attr5,ok_fraud_attr5) == 0 )
                counter_fraud_attr5++;
            }
        }
        fclose(fp1);
        if(counter_ok_attr2 == 0)
        prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
        prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);

        if(counter_ok_attr3 == 0)
        prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
                prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);

        if(counter_ok_attr4 == 0)
        prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
                prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);

        if(counter_ok_attr5 == 0)
        prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
        else
                prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);

        if(counter_fraud_attr2 == 0)
        prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);

        if(counter_fraud_attr3 == 0)
        prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);

        if(counter_fraud_attr4 == 0)
        prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);

        if(counter_fraud_attr5 == 0)
        prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
        else
                prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);

        total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
        total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;


//      printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
//      printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
//      printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
//      printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
//      printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);

        if(total_prblty_ok > total_prblty_fraud)
        {
            fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
        }   
        else
        {
            fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
        }
        counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
        counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
    }

    fclose(fp);
        fclose(fp2);    
        fclose(fp3);

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-06-14T14:25:26+00:00

There are a several things I can see right away you can do, in the order I would try them:

Stop with the repeated open-write-close, open-write-close ideology on your output files. Their names are fixed and limited. Open them all appropriately at the start of this thing, then flush-and-close when you’re done.
There are several logic-constructs that can be significantly simplified.
Your strlen() rampage needs to significantly be reduced. Most decent optimizing compilers will detect the unchanged source and optimize out the subsequent calls on a known-unchanged char-ptr, so I would do this last (but honestly I’d still do it as its a bad practice to call repeated strlen() invokes on the same data.
ADDED AFTER CONVERSING WITH OP :You repeatedly re-parse the same data file (sales_ok_fraud.txt) over and over, once for line of data in sales_unknwn.txt. 12gB/abg-line-length is a LOT of unneeded repetitious parsing if sales_ok_fraud.txt can fit in memory. Load that data once calculate its base stats once, and use the data and stats therein for the rest of your data crunch.

Logic Reductions

You can cut out a ton of work in one place in particular, changing this:

    if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr2++;

    if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr3++;

    if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr4++;

    if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"ok") == 0)
        counter_ok_attr5++;

    if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr2++;

    if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr3++;

    if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr4++;

    if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0 && strcmp(ok_fraud_attr6_str,"fraud") == 0)
        counter_fraud_attr5++;

To this:

    if (strcmp(ok_fraud_attr6_str, "ok") == 0)
    {
        if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)
            counter_ok_attr2++;

        if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0 )
            counter_ok_attr3++;

        if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)
            counter_ok_attr4++;

        if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)
            counter_ok_attr5++;
    }
    else if (strcmp(ok_fraud_attr6_str,"fraud") == 0)
    {
        if(strcmp(unknwn_attr2,ok_fraud_attr2) == 0)
            counter_fraud_attr2++;

        if(strcmp(unknwn_attr3,ok_fraud_attr3) == 0)
            counter_fraud_attr3++;

        if(strcmp(unknwn_attr4,ok_fraud_attr4) == 0)
            counter_fraud_attr4++;

        if(strcmp(unknwn_attr5,ok_fraud_attr5) == 0)
            counter_fraud_attr5++;
    }

Front-Loading sales_ok_fraud.txt

The following relies on the sanctity of the data format of your sales_ok_fraud.txt stats file, while trying to be as pedantic as possible in validating said format. It allocates a chunk of memory large enough to hold the entire file plus-one-char to treat the entire body as a single null-term-string. That buffer is then pieced-up via the same general algorithm you had prior. The result will be a table of pointers to fixed-length char-pointer arrays that can then be used iteratively in the same place you currently (and repeatedly) open, parse, use, and throw away all that content.

// declare an array of six string pointers
typedef char *OFAttribs[6];

// loads a table consisting of the following format:
//
// str1,str2,str3,str4,str5,str6\n
// str1,str2,str3,str4,str5,str6\n
// ...
// str1,str2,str3,str4,str5,str6
//
// any deviation from the above will cause premature termination of the loop
//  but will return whatever was able to be parsed up to the point of failure.
//  the caller should therefore always `free()` the resulting table and data
//  pointers.

size_t init_ok_fraud_data(const char *fname, OFAttribs **ppTable, char **ppTableData)
{
    if (!fname || !*fname)
        return 0;

    // check file open for thumbs up
    FILE *fp = fopen(fname, "rb");
    if (!fp)
        return 0;

    // allocate enough memory to hold the entire file, plus a terminator
    fseek(fp, 0,  SEEK_END);
    long len = ftell(fp);
    fseek(fp, 0,  SEEK_SET);

    // allocate enough ram for the entire file plus terminator
    OFAttribs *pTable = NULL;
    size_t nTableLen = 0;
    char *pTableData =  malloc((len+1) * sizeof(char));
    if (NULL != pTableData)
    {
        fread(pTableData , len, 1, fp);
        pTableData[len] = 0;
    }

    // no longer need the file
    fclose(fp);

    // prime first token
    char *token = strtok(pTableData, ",");
    while (token)
    {
        // read next line of tokens
        OFAttribs attribs = { NULL };
        for (int i=0;i<4 && token; ++i)
        {
            attribs[i] = token;
            token = strtok(NULL, ",");
        }

        // filled 0..3, set lat token and move on
        if (attribs[3] && token)
        {
            // next-to-last entry set
            attribs[4] = token;

            // line enter is only terminated by newline
            token = strtok(NULL, "\n");
            if (token)
            {
                // proper format. 6 parms, 5 commas, one new-line.
                attribs[5] = token;
                size_t slen = strlen(token);
                if (slen > 0)
                {
                    while (isspace(token[--slen]))
                        token[slen] = 0;
                }

                // make space on the master list for another.
                OFAttribs *tmp = realloc(pTable, sizeof(*tmp) * (nTableLen+1));
                if (NULL != tmp)
                {
                    pTable = tmp;
                    memcpy(pTable + nTableLen++, attribs, sizeof(attribs));
                }
                else
                {   // allocation failure.
                    printf("Error allocating memory for expanding OKFraud data set");
                    exit(EXIT_FAILURE);
                }
            }
            else
            {   // not good.
                printf("Invalid line format detected. Expected ok/fraud\\n");
                break;
            }

            // next token of new line
            token = strtok(NULL, ",");
        }
    }

    // set output variables
    *ppTable = pTable;
    *ppTableData = pTableData;
    return nTableLen;
}

Putting It Together

Incorporating everything above has the following effect on your code base:

// load the ok_fraud table ONCE.
OFAttribs *okfr = NULL;
char *okfr_data = NULL;
size_t okfr_len = init_ok_fraud_data("sales_ok_fraud.txt", &okfr, &okfr_data);

// walk table to determine probabilities of ok and fraud states.
//  note: this really should be done as part of the loader.
for (size_t i=0;i<okfr_len; ++i)
{
    if (0 == strcmp("ok", okfr[i][5]))
        ++counter_ok;
    else
        ++counter_fraud;
}

printf("The no. of records with OK label are %f\n",counter_ok);
printf("The no. of records with FRAUD label are %f\n",counter_fraud);

// compute probabilites for ok and fraud states
prblty_ok = (float)counter_ok/(float)(okfr_len);
prblty_fraud = (float)counter_fraud/(float)(okfr_len);
printf("The probability of OK records is %f\n",prblty_ok);
printf("The probability of FRAUD records is %f\n",prblty_fraud);

fp=fopen("sales_unknwn.txt","r");
fp2=fopen("sales_unknown_ok_classified.txt","w");
fp3=fopen("sales_unknown_fraud_classified.txt","w");
while(fgets(line1,sizeof(line1),fp)!=NULL) //Reading each line from file to calculate the file size.
{
    char *unknwn_attr1 = strtok(line1,",");
    char *unknwn_attr2 = strtok(NULL,",");
    char *unknwn_attr3 = strtok(NULL,",");
    char *unknwn_attr4 = strtok(NULL,",");
    char *unknwn_attr5 = strtok(NULL,",");

    //printf("%s-%s-%s-%s-%s\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);

    for (size_t i=0;i<okfr_len; ++i)
    {

        if( strcmp(okfr[i][5], "ok") == 0 )
        {
            // ok case
            if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
                counter_ok_attr2++;

            if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
                counter_ok_attr3++;

            if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
                counter_ok_attr4++;

            if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
                counter_ok_attr5++;
        }

        else // fraud case
        {
            if( strcmp(unknwn_attr2, okfr[i][1]) == 0 )
                counter_fraud_attr2++;

            if( strcmp(unknwn_attr3, okfr[i][2]) == 0 )
                counter_fraud_attr3++;

            if( strcmp(unknwn_attr4, okfr[i][3]) == 0 )
                counter_fraud_attr4++;

            if( strcmp(unknwn_attr5, okfr[i][4]) == 0 )
                counter_fraud_attr5++;
        }
    }

    if(counter_ok_attr2 == 0)
        prblty_attr2_given_ok = (counter_ok_attr2+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr2_given_ok = (counter_ok_attr2)/(counter_ok);

    if(counter_ok_attr3 == 0)
        prblty_attr3_given_ok = (counter_ok_attr3+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr3_given_ok = (counter_ok_attr3)/(counter_ok);

    if(counter_ok_attr4 == 0)
        prblty_attr4_given_ok = (counter_ok_attr4+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr4_given_ok = (counter_ok_attr4)/(counter_ok);

    if (counter_ok_attr5 == 0)
        prblty_attr5_given_ok = (counter_ok_attr5+arbitrary_value*prblty_ok)/(counter_ok+arbitrary_value);
    else
        prblty_attr5_given_ok = (counter_ok_attr5)/(counter_ok);

    if(counter_fraud_attr2 == 0)
        prblty_attr2_given_fraud = (counter_fraud_attr2+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr2_given_fraud = (counter_fraud_attr2)/(counter_fraud);

    if(counter_fraud_attr3 == 0)
        prblty_attr3_given_fraud = (counter_fraud_attr3+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr3_given_fraud = (counter_fraud_attr3)/(counter_fraud);

    if(counter_fraud_attr4 == 0)
        prblty_attr4_given_fraud = (counter_fraud_attr4+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr4_given_fraud = (counter_fraud_attr4)/(counter_fraud);

    if(counter_fraud_attr5 == 0)
        prblty_attr5_given_fraud = (counter_fraud_attr5+arbitrary_value*prblty_fraud)/(counter_fraud+arbitrary_value);
    else
        prblty_attr5_given_fraud = (counter_fraud_attr5)/(counter_fraud);

    total_prblty_ok = prblty_ok*prblty_attr2_given_ok*prblty_attr3_given_ok*prblty_attr4_given_ok*prblty_attr5_given_ok;
    total_prblty_fraud = prblty_fraud*prblty_attr2_given_fraud*prblty_attr3_given_fraud*prblty_attr4_given_fraud*prblty_attr5_given_fraud;


    //      printf("Testing counts for OK - %f - %f - %f - %f\n",counter_ok_attr2,counter_ok_attr3,counter_ok_attr4,counter_ok_attr5);
    //      printf("Testing counts for FRAUD - %f - %f - %f - %f\n",counter_fraud_attr2,counter_fraud_attr3,counter_fraud_attr4,counter_fraud_attr5);
    //      printf("Testing attribute probabilities for OK - %f - %f - %f - %f\n",prblty_attr2_given_ok,prblty_attr3_given_ok,prblty_attr4_given_ok,prblty_attr5_given_ok);
    //      printf("Testing attribute probabilities for FRAUD- %f - %f - %f - %f\n",prblty_attr2_given_fraud,prblty_attr3_given_fraud,prblty_attr4_given_fraud,prblty_attr5_given_fraud);
    //      printf("The final probabilities are %f - %f\n",total_prblty_ok,total_prblty_fraud);

    if(total_prblty_ok > total_prblty_fraud)
    {
        fprintf(fp2,"%s,%s,%s,%s,%s,ok\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
    }
    else
    {
        fprintf(fp3,"%s,%s,%s,%s,%s,fraud\n",unknwn_attr1,unknwn_attr2,unknwn_attr3,unknwn_attr4,unknwn_attr5);
    }
    counter_ok_attr2=counter_ok_attr3=counter_ok_attr4=counter_ok_attr5=0;
    counter_fraud_attr2=counter_fraud_attr3=counter_fraud_attr4=counter_fraud_attr5=0;
}

// free the table data and dynamic pointer array
free(okfr);
free(okfr_data);

fclose(fp);
fclose(fp2);
fclose(fp3);
return 0;

These are just a few ideas. There are more things in there to-be-sure, but these should help enormously in processing your file single-forward-scan with continuous output, which is about as efficient as you’re going to get under these circumstances. Without question the combination of the big three: single file open+close, logic reductions and single-parse-cache’ing the sales_ok_fraud.txt file will have a huge improvement in performance, especially the first and last of these.

EDIT Assisted the OP in updating this processor to front-load the sales_ok_fraud.txt file content, thereby eliminating repeated loading, parsing, and promptly throwing out some 15000+ line of text to be parsed repeatedly (once per main-source input line). Answer above updated accordingly.

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

I have implemented Naive Bayes algorithm on a large data set of 410k rows.Now

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply