Sign Up

Sign Up to our social questions and Answers Engine to ask questions, answer people’s questions, and connect with other people.

Have an account? Sign In

Have an account? Sign In Now

Sign In

Login to our social questions & Answers Engine to ask questions answer people’s questions & connect with other people.

Sign Up Here

Forgot Password?

Don't have account, Sign Up Here

Forgot Password

Lost your password? Please enter your email address. You will receive a link and will create a new password via email.

Have an account? Sign In Now

You must login to ask a question.

Forgot Password?

Need An Account, Sign Up Here

Please briefly explain why you feel this question should be reported.

Please briefly explain why you feel this answer should be reported.

Please briefly explain why you feel this user should be reported.

Sign InSign Up

The Archive Base

The Archive Base Logo The Archive Base Logo

The Archive Base Navigation

  • SEARCH
  • Home
  • About Us
  • Blog
  • Contact Us
Search
Ask A Question

Mobile menu

Close
Ask a Question
  • Home
  • Add group
  • Groups page
  • Feed
  • User Profile
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Buy Points
  • Users
  • Help
  • Buy Theme
  • SEARCH
Home/ Questions/Q 732621
In Process

The Archive Base Latest Questions

Editorial Team
  • 0
Editorial Team
Asked: May 14, 20262026-05-14T07:09:47+00:00 2026-05-14T07:09:47+00:00

I wanted to write a program that test if two files are duplicates (have

  • 0

I wanted to write a program that test if two files are duplicates (have exactly the same content). First I test if the files have the same sizes, and if they have i start to compare their contents.

My first idea, was to “split” the files into fixed size blocks, then start a thread for every block, fseek to startup character of every block and continue the comparisons in parallel. When a comparison from a thread fails, the other working threads are canceled, and the program exits out of the thread spawning loop.

The code looks like this:
dupf.h

#ifndef __NM__DUPF__H__
#define __NM__DUPF__H__
#define NUM_THREADS 15
#define BLOCK_SIZE 8192

/* Thread argument structure */
struct thread_arg_s {
    const char *name_f1;        /* First file name */
    const char *name_f2;        /* Second file name */
    int cursor;                 /* Where to seek in the file */
};
typedef struct thread_arg_s thread_arg;

/**
 * 'arg' is of type thread_arg.
 * Checks if the specified file blocks are 
 * duplicates.
 */
void *check_block_dup(void *arg);

/**
 * Checks if two files are duplicates
 */
int check_dup(const char *name_f1, const char *name_f2);

/**
* Returns a valid pointer to a file.
* If the file (given by the path/name 'fname') cannot be opened
* in 'mode', the program is interrupted an error message is shown.
**/
FILE *safe_fopen(const char *name, const char *mode);

#endif

dupf.c

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "dupf.h"

FILE *safe_fopen(const char *fname, const char *mode)
{
    FILE *f = NULL;
    f = fopen(fname, mode);
    if (f == NULL) {
        char emsg[255];
        sprintf(emsg, "FOPEN() %s\t", fname);
        perror(emsg);
        exit(-1);
    }
    return (f);
}

void *check_block_dup(void *arg)
{
    const char *name_f1 = NULL, *name_f2 = NULL;    /* File names */
    FILE *f1 = NULL, *f2 = NULL;                    /* Streams */
    int cursor = 0;                                 /* Reading cursor */
    char buff_f1[BLOCK_SIZE], buff_f2[BLOCK_SIZE];  /* Character buffers */
    int rchars_1, rchars_2;                         /* Readed characters */
    /* Initializing variables from 'arg' */
    name_f1 = ((thread_arg*)arg)->name_f1;
    name_f2 = ((thread_arg*)arg)->name_f2;
    cursor = ((thread_arg*)arg)->cursor;
    /* Opening files */
    f1 = safe_fopen(name_f1, "r");
    f2 = safe_fopen(name_f2, "r");
    /* Setup cursor in files */
    fseek(f1, cursor, SEEK_SET);
    fseek(f2, cursor, SEEK_SET);
    /* Initialize buffers */
    rchars_1 = fread(buff_f1, 1, BLOCK_SIZE, f1);
    rchars_2 = fread(buff_f2, 1, BLOCK_SIZE, f2);
    if (rchars_1 != rchars_2) {
        /* fread failed to read the same portion.
         * program cannot continue */
        perror("ERROR WHEN READING BLOCK");
        exit(-1);
    }
    while (rchars_1-->0) {
        if (buff_f1[rchars_1] != buff_f2[rchars_1]) {
            /* Different characters */
            fclose(f1);
            fclose(f2);
            pthread_exit("notdup");
        }
    }
    /* Close streams */
    fclose(f1);
    fclose(f2);
    pthread_exit("dup");
}

int check_dup(const char *name_f1, const char *name_f2)
{
    int num_blocks = 0;             /* Number of 'blocks' to check */
    int num_tsp = 0;                /* Number of threads spawns */
    int tsp_iter = 0;               /* Iterator for threads spawns */
    pthread_t *tsp_threads = NULL;
    thread_arg *tsp_threads_args = NULL;
    int tsp_threads_iter = 0;
    int thread_c_res = 0;           /* Thread creation result */
    int thread_j_res = 0;           /* Thread join res */
    int loop_res = 0;               /* Function result */
    int cursor;
    struct stat buf_f1;
    struct stat buf_f2;

    if (name_f1 == NULL || name_f2 == NULL) {
        /* Invalid input parameters */
        perror("INVALID FNAMES\t");
        return (-1);
    }

    if (stat(name_f1, &buf_f1) != 0 || stat(name_f2, &buf_f2) != 0) {
        /* Stat fails */
        char emsg[255];
        sprintf(emsg, "STAT() ERROR: %s %s\t", name_f1, name_f2);
        perror(emsg);
        return (-1);
    }

    if (buf_f1.st_size != buf_f2.st_size) {
        /* File have different sizes */
        return (1);
    }

    /* Files have the same size, function exec. is continued */
    num_blocks = (buf_f1.st_size / BLOCK_SIZE) + 1;
    num_tsp = (num_blocks / NUM_THREADS) + 1;
    cursor = 0;
    for (tsp_iter = 0; tsp_iter < num_tsp; tsp_iter++) {
        loop_res = 0;
        /* Create threads array for this spawn */
        tsp_threads = malloc(NUM_THREADS * sizeof(*tsp_threads));
        if (tsp_threads == NULL) {
            perror("TSP_THREADS ALLOC FAILURE\t");
            return (-1);
        }
        /* Create arguments for every thread in the current spawn */
        tsp_threads_args = malloc(NUM_THREADS * sizeof(*tsp_threads_args));
        if (tsp_threads_args == NULL) {
            perror("TSP THREADS ARGS ALLOCA FAILURE\t");
            return (-1);
        }
        /* Initialize arguments and create threads */
        for (tsp_threads_iter = 0; tsp_threads_iter < NUM_THREADS;
                tsp_threads_iter++) {
            if (cursor >= buf_f1.st_size) {
                break;
            }
            tsp_threads_args[tsp_threads_iter].name_f1 = name_f1;
            tsp_threads_args[tsp_threads_iter].name_f2 = name_f2;
            tsp_threads_args[tsp_threads_iter].cursor = cursor;
            thread_c_res = pthread_create(
                               &tsp_threads[tsp_threads_iter],
                               NULL,
                               check_block_dup,
                               (void*)&tsp_threads_args[tsp_threads_iter]);
            if (thread_c_res != 0) {
                perror("THREAD CREATION FAILURE");
                return (-1);
            }
            cursor+=BLOCK_SIZE;
        }
        /* Join last threads and get their status */
        while (tsp_threads_iter-->0) {
            void *thread_res = NULL;
            thread_j_res = pthread_join(tsp_threads[tsp_threads_iter],
                                        &thread_res);
            if (thread_j_res != 0) {
                perror("THREAD JOIN FAILURE");
                return (-1);
            }
            if (strcmp((char*)thread_res, "notdup")==0) {
                loop_res++;
                /* Closing other threads and exiting by condition
                 * from loop. */
                while (tsp_threads_iter-->0) {
                    pthread_cancel(tsp_threads[tsp_threads_iter]);
                }
            }
        }
        free(tsp_threads);
        free(tsp_threads_args);
        if (loop_res > 0) {
            break;
        }
    }
    return (loop_res > 0) ? 1 : 0;
}

The function works fine (at least for what I’ve tested). Still, some guys from #C (freenode) suggested that the solution is overly complicated, and it may perform poorly because of parallel reading on hddisk.

What I want to know:

  • Is the threaded approach flawed by default ?
  • Is fseek() so slow ?
  • Is there a way to somehow map the files to memory and then compare them ?

LATED EDIT:

Today I had some time, and I’ve followed your advices. You were right, this threaded version actually performs worse than a single threaded version, and all because of the parallel readings on hard disk.

Another thing is that I’ve written a function that uses mmap(), and until now is the optimal one. Still the biggest drawback of that function is that it fails, when the files are getting really big.

Here is the new implementation (a very brute and direct code):

#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "dupf.h"

/**
* Safely assures that a file is opened. 
* If cannot open file, the flow of the program is interrupted.
* The error code returned is -1.
**/
FILE *safe_fopen(const char *fname, const char *mode)
{
    FILE *f = NULL;
    f = fopen(fname, mode);
    if (f == NULL) {
        char emsg[1024];
        sprintf(emsg, "Cannot open file: %s\t", fname);
        perror(emsg);
        exit(-1);
    }
    return (f);
}

/**
* Check if two files have the same size.
* Returns:
* -1    Error.
* 0 If they have the same size.
* 1 If the don't have the same size.
**/
int check_same_size(const char *f1_name, const char *f2_name, off_t *f1_size, off_t *f2_size)
{
    struct stat f1_stat, f2_stat;
    if((f1_name == NULL) || (f2_name == NULL)){
        fprintf(stderr, "Invalid filename passed to function [check_same_size].\n");
        return (-1);
    }
    if((stat(f1_name, &f1_stat) != 0) || (stat(f2_name, &f2_stat) !=0)){
        fprintf(stderr, "Cannot apply stat. [check_same_size].\n");
        return (-1);
    }
    if(f1_size != NULL){
        *f1_size = f1_stat.st_size;
    }
    if(f2_size != NULL){
        *f2_size = f2_stat.st_size;
    }
    return (f1_stat.st_size == f2_stat.st_size) ? 0 : 1;
}

/**
* Test if two files are duplicates.
* Returns:
* -1    Error.
* 0 If they are duplicates.
* 1 If they are not duplicates.
**/
int check_dup_plain(char *f1_name, char *f2_name, int block_size)
{
    if ((f1_name == NULL) || (f2_name == NULL)){
        fprintf(stderr, "Invalid filename passed to function [check_dup_plain].\n");
        return (-1);
    }
    FILE *f1 = NULL, *f2 = NULL;
    char f1_buff[block_size], f2_buff[block_size];
    size_t rch1, rch2;
    if(check_same_size(f1_name, f2_name, NULL, NULL) == 1){
        return (1);
    }
    f1 = safe_fopen(f1_name, "r");
    f2 = safe_fopen(f2_name, "r");
    while(!feof(f1) && !feof(f2)){
        rch1 = fread(f1_buff, 1, block_size, f1);
        rch2 = fread(f2_buff, 1, block_size, f2);
        if(rch1 != rch2){
            fprintf(stderr, "Invalid reading from file. Cannot continue. [check_dup_plain].\n");
            return (-1);
        }
        while(rch1-->0){
            if(f1_buff[rch1] != f2_buff[rch1]){
                return (1);
            }
        }
    }
    fclose(f1);
    fclose(f2);
    return (0);
}

/**
* Test if two files are duplicates.
* Returns:
* -1    Error.
* 0 If they are duplicates.
* 1 If they are not duplicates.
**/
int check_dup_memmap(char *f1_name, char *f2_name)
{
    struct stat f1_stat, f2_stat;
    char *f1_array = NULL, *f2_array = NULL;
    off_t f1_size, f2_size;
    int f1_des, f2_des, cont, res;
    if((f1_name == NULL) || (f2_name == NULL)){
        fprintf(stderr, "Invalid filename passed to function [check_dup_memmap].\n");
        return (-1);    
    }
    if(check_same_size(f1_name, f2_name, &f1_size, &f2_size) == 1){
        return (1);
    }
    f1_des = open(f1_name, O_RDONLY);
    f2_des = open(f2_name, O_RDONLY);
    if((f1_des == -1) || (f2_des == -1)){
        perror("Cannot open file");
        exit(-1);       
    }
    f1_array = mmap(0, f1_size * sizeof(*f1_array), PROT_READ, MAP_SHARED, f1_des, 0);
    if(f1_array == NULL){
        fprintf(stderr, "Cannot map file to memory [check_dup_memmap].\n");
        return (-1);
    }
    f2_array = mmap(0, f2_size * sizeof(*f2_array), PROT_READ, MAP_SHARED, f2_des, 0);
    if(f2_array == NULL){
        fprintf(stderr, "Cannot map file to memory [check_dup_memmap].\n");
        return (-1);
    }
    cont = f1_size;
    res = 0;
    while(cont-->0){
        if(f1_array[cont]!=f2_array[cont]){
            res = 1;
            break;
        }
    }
    munmap((void*) f1_array, f1_size * sizeof(*f1_array));
    munmap((void*) f2_array, f2_size * sizeof(*f2_array));
    return res;
}

int main(int argc, char *argv[])
{
    printf("result: %d\n",check_dup_memmap("f2","f1"));
    return (0);
}

I am planning now to extend this code, by re-adding the threaded functionality, but this time the reading will be on memory.

Thanks for your answers.

  • 1 1 Answer
  • 0 Views
  • 0 Followers
  • 0
Share
  • Facebook
  • Report

Leave an answer
Cancel reply

You must login to add an answer.

Forgot Password?

Need An Account, Sign Up Here

1 Answer

  • Voted
  • Oldest
  • Recent
  • Random
  1. Editorial Team
    Editorial Team
    2026-05-14T07:09:47+00:00Added an answer on May 14, 2026 at 7:09 am

    It’s hard to guess about performance without a real system to test against (for example if you’re using a solid state drive, there’s no head seek time and the cost of reading different sectors from different threads is almost zero).

    If this is running against a reasonably standard computer with regular (spinning platter) hard drives, having multiple threads contend for the part of the disk they want to read from will possibly slow things down (depending, again, on the hardware and also the size of the chunks).

    If the time it takes to compute the “sameness” of a chunk is fast compared to the time it takes to read that chunk from disk, having a separate thread will not help much since the second (or third…) thread would spend most of it’s time waiting for IO to complete anyway.

    Another factor is the cache size of the CPU. If all of the memory you’re processing at one time fits in the CPU cache, things will be much faster than if different threads cause different chunks of memory to be loaded into cache as they execute instructions.

    If you have more threads than you have CPU cores, you will just slow things down by making unnecessary context switches (since a thread needs a core to run on).

    After reading all of that, if you still think multithreading is going to help for your target system, consider one thread that does IO only, places the data in a queue, and has two or more worker threads taking data off of the queue to process. That way, you optimize disk IO and can take advantage of multiple cores to crunch the numbers.

    Steve suggested you can memory map you files on Unix. That will speed up access to the underlying data a bit by leveraging low level OS functionality (the same kind used to manage swap files). That will give you some performance improvement as the OS will handle loading the parts of the file you are working on into memory efficiently, as long as the file fits into available address space. FYI you can do the same thing on Windows.

    • 0
    • Reply
    • Share
      Share
      • Share on Facebook
      • Share on Twitter
      • Share on LinkedIn
      • Share on WhatsApp
      • Report

Sidebar

Ask A Question

Stats

  • Questions 474k
  • Answers 474k
  • Best Answers 0
  • User 1
  • Popular
  • Answers
  • Editorial Team

    How to approach applying for a job at a company ...

    • 7 Answers
  • Editorial Team

    What is a programmer’s life like?

    • 5 Answers
  • Editorial Team

    How to handle personal stress caused by utterly incompetent and ...

    • 5 Answers
  • Editorial Team
    Editorial Team added an answer If you want a subtree using FOR XML PATH, you'll… May 16, 2026 at 4:23 am
  • Editorial Team
    Editorial Team added an answer If you want to set $ties = 1 if there… May 16, 2026 at 4:23 am
  • Editorial Team
    Editorial Team added an answer What might be a possibly solution is if you create… May 16, 2026 at 4:23 am

Trending Tags

analytics british company computer developers django employee employer english facebook french google interview javascript language life php programmer programs salary

Top Members

Explore

  • Home
  • Add group
  • Groups page
  • Communities
  • Questions
    • New Questions
    • Trending Questions
    • Must read Questions
    • Hot Questions
  • Polls
  • Tags
  • Badges
  • Users
  • Help
  • SEARCH

Footer

© 2021 The Archive Base. All Rights Reserved
With Love by The Archive Base

Insert/edit link

Enter the destination URL

Or link to existing content

    No search term specified. Showing recent items. Search or use up and down arrow keys to select an item.