Recently I’m dealing with the access log of nginx analyzing work with python. I’ve

Question

0

Editorial Team

Asked: June 1, 20262026-06-01T19:54:55+00:00 2026-06-01T19:54:55+00:00

Recently I’m dealing with the access log of nginx analyzing work with python. I’ve

0

Recently I’ve been working on analyzing nginx access logs with Python.

I’ve found a way to split the quoted string on spaces using shlex, following this reference.

But it’s really slow: parsing 2000 log lines takes more than 1.2 seconds, while my nginx server generates more than 2500 lines per second.

So I’ve tried using re, and also a more primitive (and crude) approach that works with character indices into the string.

Both versions run in a virtual machine, and each takes a little more than 0.5 seconds for 2000 log lines.

Do I have any other choice to make it more efficient?

Thanks in advance

Here’s my code

import re
import time
import datetime
# Sample access-log line; the benchmark loop below passes it to convert().
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080  [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'
# Patterns are compiled once at import time so convert() does not repeat the
# compile call (and its pattern-cache lookup) for every log line.
_WHITESPACE_RE = re.compile(r"\s+")
_METHOD_RE = re.compile(r"^((?:GET|POST|OPTIONS))\s+?.*HTTP\/1\.[10]$")
_STATUS_BYTES_RE = re.compile(r"^\s+(\d{1,3})\s+(\d+)")
_PORT_SUFFIX_RE = re.compile(r":\d+$")
_REQUEST_URI_RE = re.compile(r"^[A-Z]+\s+?(.*)HTTP\/1\.[10]$")
_SERVER_GUID_RE = re.compile(r"^([0-9\.]+)\s+(.*)")


def convert(line):
    """Parse one access-log line into a dict of typed fields.

    The line is first cut on double quotes; the unquoted prefix is then cut
    on whitespace, and the remaining values are picked out of the quoted and
    unquoted segments by position.

    Returns a dict with the keys hour, year, date (month * 100 + day), time
    (the minute), sec, request_time and upstream_response_time (both in
    milliseconds, the latter -1 when the log holds '-'), remote_addr,
    upstream_addr (port removed), host, method ('' when the request is not a
    GET/POST/OPTIONS over HTTP/1.x), request_uri (a list, as produced by
    findall), status, bytes_sent, http_referer, user_agent, gzip_ratio,
    http_x_forwarded_for, server_addr and guid.

    Raises IndexError when an expected segment or pattern match is missing
    and ValueError when a number or the timestamp cannot be parsed.
    """
    parts = line.split('"')
    line_pre = _WHITESPACE_RE.split(parts[0])
    request = parts[1]

    methods = _METHOD_RE.findall(request)
    http_method = methods[0] if methods else ''

    code_byte = _STATUS_BYTES_RE.findall(parts[2])
    status = int(code_byte[0][0])
    bytes_sent = int(code_byte[0][1])

    upstream_addr = _PORT_SUFFIX_RE.sub("", line_pre[4])
    request_time = int(float(line_pre[0]) * 1000)
    if line_pre[1] == '-':
        upstream_response_time = -1
    else:
        upstream_response_time = int(float(line_pre[1]) * 1000)
    remote_addr = line_pre[2]
    # Tokens come from a whitespace split, so no space stripping is needed.
    host = line_pre[7]

    # line_pre[5] is the timestamp without its UTC offset; the offset was
    # split off into line_pre[6] and is not used.
    logdatetime = line_pre[5].replace('[', '')
    dt = datetime.datetime.strptime(logdatetime, "%d/%b/%Y:%H:%M:%S")
    # Read the components directly instead of slicing str(dt).
    year = dt.year
    monthday = dt.month * 100 + dt.day
    hour = dt.hour
    logtime = dt.minute
    # mktime interprets the naive timestamp in the machine's local timezone.
    sec = time.mktime(dt.timetuple())

    request_uri = _REQUEST_URI_RE.findall(request)
    http_referer = parts[3]
    user_agent = parts[5]
    gzip_ratio = parts[7]
    http_x_forwarded_for = parts[9]

    serad_guid = _SERVER_GUID_RE.findall(parts[11])
    server_addr = serad_guid[0][0]
    guid = serad_guid[0][1]

    return {
        "hour": hour,
        "year": year,
        "date": monthday,
        "time": logtime,
        "sec": sec,
        "request_time": request_time,
        "upstream_response_time": upstream_response_time,
        "remote_addr": remote_addr,
        "upstream_addr": upstream_addr,
        "host": host,
        "method": http_method,
        "request_uri": request_uri,
        "status": status,
        "bytes_sent": bytes_sent,
        "http_referer": http_referer,
        "user_agent": user_agent,
        "gzip_ratio": gzip_ratio,
        "http_x_forwarded_for": http_x_forwarded_for,
        "server_addr": server_addr,
        "guid": guid,
    }
# Benchmark: call convert() on the sample line 12000 times and print the
# elapsed wall-clock seconds after every 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        t1 = t2
        t2 = time.time()
        # The parenthesized form runs as a Python 2 statement and a Python 3 call.
        print(str(t2 - t1))

and

the index-based way

import time
import datetime
# Sample access-log line; the benchmark loop below passes it to convert().
line = '0.278 0.264 113.116.52.174 - 10.10.3.41:20080  [08/Apr/2012:23:59:08 +0800] shenzhen.anjuke.com "GET /ajax/propext/?proid=104178677&commid=97047&brokerid=262646&rand=0.905798233768953 HTTP/1.0" 200 10914 "http://shenzhen.anjuke.com/prop/view/104178677" "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; 360SE)" "-" "-" - "114.80.230.198 026AF756-C5DF-3EDA-B96A-F31A39F7C65E"'

def pair(l):
    """Yield consecutive, non-overlapping 2-tuples (l[0], l[1]), (l[2], l[3]), ...

    The length is read once up front.  An odd-length sequence raises
    IndexError when the final, incomplete pair is reached.
    """
    total = len(l)
    left = 0
    while left < total:
        yield (l[left], l[left + 1])
        left += 2

def convert(line):
    """Split a log line into space-separated fields, keeping quoted text whole.

    Spaces between a pair of double quotes do not separate fields, and the
    quotes remain part of the returned field.  A run of several unquoted
    spaces counts as a single separator, so no empty fields are produced and
    neighbouring fields are never fused together.  Every field is returned,
    including the first one on the line.

    Returns a list of field strings, or None when the line contains no
    double quotes or an odd number of them.
    """
    segments = line.split('"')
    # n quotes produce n + 1 segments, so a usable line (n > 0 and n even)
    # always yields an odd segment count of at least 3.
    if len(segments) < 3 or len(segments) % 2 == 0:
        return None

    fields = []
    current = ""  # field under construction; may span a quote boundary
    for index, segment in enumerate(segments):
        if index % 2:
            # Odd segments sat between quotes: keep them verbatim, quotes
            # restored, as part of the current field.
            current += '"' + segment + '"'
            continue
        pieces = segment.split(" ")
        # Text before the first space still belongs to the current field.
        current += pieces[0]
        for piece in pieces[1:]:
            # Each further piece follows a separator, so the current field
            # is complete; empty ones come from repeated spaces.
            if current:
                fields.append(current)
            current = piece
    if current:
        fields.append(current)
    return fields


# def allindices(string, sub, listindex=[], offset=0):
def allindices(string, sub):
    """Return every index at which sub occurs in string, in ascending order.

    Each search resumes one position after the previous hit, so overlapping
    occurrences are all reported.  The list is empty when sub never occurs.
    """
    hits = []
    start = 0
    while True:
        found = string.find(sub, start)
        if found < 0:
            return hits
        hits.append(found)
        start = found + 1

# Benchmark: call convert() on the sample line 12000 times and print the
# elapsed wall-clock seconds after every 2000 calls.
t2 = time.time()
count = 0
for i in range(12000):
    convert(line)
    count += 1
    if count % 2000 == 0:
        t1 = t2
        t2 = time.time()
        # The parenthesized form runs as a Python 2 statement and a Python 3 call.
        print(str(t2 - t1))

Report

Leave an answer
Cancel reply

You must login to add an answer.

Need An Account,

1 Answer

Editorial Team · Answer 1 · 2026-06-01T19:54:57+00:00

I just wrote a regex based on the sample line. I don’t actually know the meaning of some fields, so I used placeholder names for them; you can rename them to something more meaningful. On my machine this snippet is 4–5 times faster than your first one.

# Precompiled verbose pattern that matches one access-log line from its start
# and exposes each piece as a named group.  Several group names (field1..4,
# float1/2, number) are placeholders rather than real field names.
log_line_re = re.compile(
r"""
(?P<float1>[0-9.]+)
\s
(?P<float2>[0-9.]+|-)   # a bare dash is accepted in place of the number
\s
(?P<ip1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
\s
(?P<field1>.+?)
\s
(?P<ip_port_1>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})
\s+
\[(?P<request_date>.+?)\]
\s
(?P<host>.+?)
\s
"
(?P<http_method>[A-Z]+)
\s
(?P<request_path>.+?)
\s
HTTP/(?P<http_version>[0-9.]+)
"
\s
(?P<status_code>\d{3})
\s
(?P<number>\d+)
\s
"
(?P<referer>.+?)
"
\s
"(?P<user_agent>.+?)"
\s
"(?P<field2>.+?)"
\s
"(?P<field3>.+?)"
\s
(?P<field4>.+?)
\s                      # separator, kept out of field4
"
(?P<ip2>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})
\s
(?P<request_guid>.+?)
"
""", re.VERBOSE)


def convert(line):
    """Match line against log_line_re and return its named groups as a dict.

    Raises AttributeError when the line does not match, because the match
    result is then None.
    """
    matched = log_line_re.match(line)
    return matched.groupdict()

Sign Up

Sign In

Forgot Password

The Archive Base Latest Questions

Recently I’m dealing with the access log of nginx analyzing work with python. I’ve

Leave an answerCancel reply

1 Answer

Leave an answer
Cancel reply