I’m working on some text conversion routines that parse time values in different formats in Ruby.
This routine is growing in complexity, and I’m currently testing a better approach to this problem.
I’m currently trying scanf, because I always assumed it was faster than a regex. In Ruby, however, it turned out to be much slower!
What am I doing wrong?
Note: I’m using ruby-1.9.2-p290 [ x86_64 ] (MRI)
First Ruby test:
require "scanf"
require 'benchmark'
# Converts a duration string of the form "HH:MM:SS" (two or more hour digits,
# exactly two minute and two second digits) into a number of seconds.
#
# Returns the total as an Integer, or nil when the string is not in that form.
def duration_in_seconds_regex(duration)
  # \A and \Z anchor to the whole string; ^ and $ only anchor to a line, which
  # let multi-line input such as "junk\n00:10:30" pass validation. \Z (rather
  # than \z) keeps accepting a single trailing newline, as the original did.
  match = /\A(\d{2,}):(\d{2}):(\d{2})\Z/.match(duration)
  return nil unless match

  # The captures are already split, so no second pass over the string is needed.
  h, m, s = match.captures.map { |n| n.to_i }
  h * 3600 + m * 60 + s
end
def duration_in_seconds_scanf(duration)
a = duration.scanf("%d:%d:%d")
a[0] * 3600 + a[1] * 60 + a[2]
end
# Benchmark driver: calls each converter n times on the same input and prints
# one Benchmark report per converter (scanf first, then regex).
n = 500000
Benchmark.bm do |bench|
  bench.report { n.times { duration_in_seconds_scanf("00:10:30") } }
end
Benchmark.bm do |bench|
  bench.report { n.times { duration_in_seconds_regex("00:10:30") } }
end
This is what I got using scanf first and a regex second:
user system total real
95.020000 0.280000 95.300000 ( 96.364077)
user system total real
2.820000 0.000000 2.820000 ( 2.835170)
Second test using C:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/types.h>
#include <string.h>
#include <regex.h>
/*
 * Returns a newly allocated copy of the first substring of `string` that
 * matches the POSIX extended regular expression `patrn`.
 *
 * Returns NULL when the pattern does not compile, when nothing matches, or
 * when memory allocation fails.
 *
 * On a match, *begin and *end receive the offsets of the match within
 * `string` (start inclusive, end exclusive); otherwise they are left
 * untouched. The pattern is compiled and released on every call.
 *
 * The caller owns the returned string and must free() it.
 */
char *regexp(char *string, char *patrn, int *begin, int *end) {
    char *word = NULL;
    regex_t rgT;
    regmatch_t match;

    /* After a failed regcomp() rgT holds no usable pattern: bail out early
     * instead of passing it to regexec()/regfree(). */
    if (regcomp(&rgT, patrn, REG_EXTENDED) != 0) {
        return NULL;
    }

    if (regexec(&rgT, string, 1, &match, 0) == 0) {
        size_t len = (size_t) (match.rm_eo - match.rm_so);

        *begin = (int) match.rm_so;
        *end = (int) match.rm_eo;

        word = malloc(len + 1);
        if (word != NULL) {
            memcpy(word, string + match.rm_so, len);
            word[len] = '\0';
        }
    }

    regfree(&rgT);
    return word;
}
/*
 * Benchmark driver. Parses the same "HH:MM:SS" string 500k times with
 * sscanf(), then 500k times with regexp() followed by strtok() splitting,
 * and prints the processor time (as reported by clock()) spent in each loop.
 *
 * Note: regexp() compiles its pattern on every call, so the second figure
 * includes 500k regcomp()/regfree() pairs in addition to the matching.
 */
int main(int argc, char** argv) {
    enum { ITERATIONS = 500000 };
    char *str = "00:01:30";
    char delims[] = ":";
    int h, m, s;
    int i, b, e;
    clock_t start_ticks;
    double time_elapsed;

    (void) argc;
    (void) argv;

    start_ticks = clock();
    for (i = 0; i < ITERATIONS; i++) {
        if (sscanf(str, "%d:%d:%d", &h, &m, &s) == 3) {
            s = h * 3600L + m * 60L + s;
        }
    }
    /* Subtract raw ticks first and convert once, in double, to avoid the
     * precision loss of storing absolute times in a float. */
    time_elapsed = (double) (clock() - start_ticks) / CLOCKS_PER_SEC;
    printf("sscanf_time (500k iterations): %.4f", time_elapsed);

    start_ticks = clock();
    for (i = 0; i < ITERATIONS; i++) {
        char *match = regexp(str, "[0-9]{2,}:[0-9]{2}:[0-9]{2}", &b, &e);

        /* regexp() returns NULL on no match; the strcmp() then checks that
         * the match covers the entire input. */
        if (match != NULL && strcmp(match, str) == 0) {
            /* strtok() modifies its argument, so work on a private copy.
             * Size it from the string length, not from sizeof a pointer. */
            char *copy = malloc(strlen(str) + 1);

            if (copy != NULL) {
                char *hours, *minutes, *seconds;

                strcpy(copy, str);
                hours = strtok(copy, delims);
                minutes = strtok(NULL, delims);
                seconds = strtok(NULL, delims);
                if (hours != NULL && minutes != NULL && seconds != NULL) {
                    /* strtok() yields text tokens; convert them to numbers. */
                    h = (int) strtol(hours, NULL, 10);
                    m = (int) strtol(minutes, NULL, 10);
                    s = (int) strtol(seconds, NULL, 10);
                    s = h * 3600L + m * 60L + s;
                }
                free(copy);
            }
        }
        free(match);
    }
    time_elapsed = (double) (clock() - start_ticks) / CLOCKS_PER_SEC;
    printf("\n\nregex_time (500k iterations): %.4f", time_elapsed);

    return (EXIT_SUCCESS);
}
The C code results are obviously faster, and the regex results are slower than scanf results as expected:
sscanf_time (500k iterations): 0.1774
regex_time (500k iterations): 3.9692
It is obvious that the C running time is faster, so please don’t comment that Ruby is interpreted and things like that.
This is the related gist.
The problem is not that it’s interpreted, but that everything in Ruby is an object. You can explore “scanf.rb” in your Ruby distribution and compare it to scanf implementation in C.
The Ruby implementation of scanf is based on RegExp matching. Every atom like “%d” is an object in Ruby, while it’s only one case item in C. So, to my mind, the reason for such a long execution time is the large amount of object allocation and deallocation.