ユーティリティ:PythonによるWebサイトのアクセスログ分析プロジェクトの完全なコード


PyCharmのターミナルで次のコマンドを実行します:
pip install user-agents pyyaml ua-parser

完全なコード:
from pathlib import Path
import datetime
import re
from user_agents import parse
from queue import Queue
import threading
from collections import defaultdict
#192.168.56.1 - - [18/Mar/2019:10:55:04 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 200 64 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"


# Log-line parsing (regular expression)
# Compiled once at import time. One named group per field of an access-log
# line; the names 'datetime', 'request', 'status', 'size' and 'useragent' are
# the keys that the converter table and the handlers look up. 'remote' and
# 'referer' label the two remaining fields.
_LOG_PATTERN = re.compile(
    r'(?P<remote>[\d.]{7,}) - - \[(?P<datetime>[^\[\]]+)\] '
    r'"(?P<request>[^"]+)" (?P<status>\d+) (?P<size>\d+) '
    r'"(?P<referer>[^"]+)" "(?P<useragent>[^"]+)"'
)


def extract(line:str)->dict:
    """Parse one access-log line into a dict of converted fields.

    The line is matched from its start against ``_LOG_PATTERN``. Every
    captured field is passed through the converter registered for its name
    in the module-level ``ops`` table; fields without a converter are kept
    as the raw string.

    :param line: one line of the log file (a trailing newline is fine).
    :return: dict keyed by field name, or ``None`` when the line does not
        match the expected format.
    """
    matcher = _LOG_PATTERN.match(line)
    if matcher is None:
        return None
    # Identity fallback keeps fields that have no registered converter.
    return {k: ops.get(k, lambda x: x)(v) for k, v in matcher.groupdict().items()}

def convers_time(src:str):
    """Convert a log timestamp such as ``18/Mar/2019:10:55:04 +0800``.

    :param src: timestamp text in ``%d/%b/%Y:%H:%M:%S %z`` layout.
    :return: timezone-aware ``datetime.datetime``.
    """
    return datetime.datetime.strptime(src, '%d/%b/%Y:%H:%M:%S %z')

def convers_request(src:str):
    """Split a request line into its whitespace-separated parts.

    :param src: request text, e.g. ``POST /index.html HTTP/1.1``.
    :return: dict pairing ``method``, ``url`` and ``protocol`` with the
        tokens of *src* in order; missing tokens simply leave their key out
        and tokens beyond the third are ignored.
    """
    field_names = ['method','url','protocol']
    tokens = src.split()
    return {name: token for name, token in zip(field_names, tokens)}

def convers_useragent(src:str):
    """Hand the raw User-Agent header text to ``user_agents.parse``.

    :param src: User-Agent string taken from the log line.
    :return: whatever ``parse`` returns for *src*.
    """
    return parse(src)

# Field name -> converter applied to the captured text of that field.
# Fields that are not listed here are left as plain strings by the caller.
ops = {
    'datetime': convers_time,
    'request': convers_request,
    'status': int,
    'size': int,
    'useragent': convers_useragent,
}

def _iter_log_file(file_path):
    """Yield the parsed record of every line of *file_path* that extract() accepts."""
    with open(file_path) as f:
        for line in f:
            ret = extract(line)
            if ret:
                yield ret
            # TODO: lines that fail to parse are skipped silently for now.


def openfile(pathstr:str)->dict:
    """Generate parsed log records from a file or from a directory of files.

    This is a generator: nothing is read until it is iterated.

    * If *pathstr* is a regular file, its lines are parsed one by one.
    * If *pathstr* is a directory, every regular file directly inside it is
      parsed in turn (sub-directories are not descended into).
    * Otherwise a "file not found" message is printed and nothing is yielded.

    :param pathstr: path of a log file or of a directory holding log files.
    :return: yields one dict per line that ``extract`` could parse.
    """
    path = Path(pathstr)
    if path.is_file():
        yield from _iter_log_file(path)
    elif path.is_dir():
        for item in path.iterdir():
            # Test and open the directory entry itself, not the directory.
            if item.is_file():
                yield from _iter_log_file(item)
    else:
        print(pathstr,'file not found')
        # raise ImportError


#####################################################################################################
# Sliding time window
def window(src:Queue,handler,width:int=20,interval:int=20):
    """Consume records from a queue and run *handler* over time windows.

    Records are dicts carrying a timezone-aware ``'datetime'`` value. Each
    record is appended to a buffer; whenever the newest record is at least
    *interval* seconds later than the previous evaluation point, *handler*
    is called with the buffer, its result is printed, and the buffer is
    trimmed to the records newer than ``width - interval`` seconds before
    the newest one (the overlap carried into the next window).

    The loop blocks on ``src.get()``. Putting ``None`` on the queue ends the
    loop and makes the function return; records still buffered at that point
    are not passed to *handler*.

    :param src: queue delivering record dicts, optionally ended by ``None``.
    :param handler: callable taking the list of buffered records.
    :param width: window length in seconds.
    :param interval: seconds between two evaluations; must not exceed *width*.
    :raises ImportError: if ``width < interval``. The exception type is odd
        for an argument error but is kept so existing callers are unaffected.
    """
    if width < interval:
        raise ImportError('width must be greater than or equal to interval')

    # Far-past starting point so that the very first record triggers an
    # evaluation; `current` starts one second after `start`.
    tz = datetime.timezone(datetime.timedelta(hours=8))
    start = datetime.datetime(1970, 1, 1, 1, 1, 1, tzinfo=tz)
    current = datetime.datetime(1970, 1, 1, 1, 1, 2, tzinfo=tz)
    delta = datetime.timedelta(seconds=width - interval)

    buffer = []

    while True:
        data = src.get() #block
        if data is None:
            # End-of-stream marker: stop consuming so the thread can finish.
            return
        if data:
            buffer.append(data)
            current = data['datetime']
        if (current - start).total_seconds() >= interval:
            ret = handler(buffer)
            print(ret)
            start = current

            # Keep only the records that overlap into the next window.
            buffer = [i for i in buffer if i['datetime'] > current - delta]
#####################################################################################################
# Handler functions (one per analysis) --- passed to window()
# Pass-through handler
def donothing_handler(iterable:list):
    """Return the buffered records unchanged (the very same object).

    :param iterable: list of records handed over for one window.
    :return: *iterable* itself, without copying.
    """
    return iterable

# Status-code distribution
def status_handler(iterable:list):
    """Compute the percentage share of each status code in a window.

    :param iterable: list of records, each with a ``'status'`` entry.
    :return: dict mapping every status seen to its share of the records,
        expressed as a percentage (0-100); empty dict for no records.
    """
    counts = defaultdict(int)
    for record in iterable:
        counts[record['status']] += 1
    total = sum(counts.values())
    shares = {}
    for status, hits in counts.items():
        shares[status] = hits/total*100
    return shares

# Browser statistics
def browser_handler(iterable:list):
    """Count the records of a window per browser family and version.

    :param iterable: list of records whose ``'useragent'`` entry exposes
        ``browser.family`` and ``browser.version_string``.
    :return: ``defaultdict`` mapping ``(family, version_string)`` to the
        number of records seen; missing keys read as 0.
    """
    hits = defaultdict(int)
    for record in iterable:
        browser = record['useragent'].browser
        hits[(browser.family, browser.version_string)] += 1
    return hits

# gn = window(*['test.log'],handler=donothing_handler)



# Dispatcher
def dispacher(src):
    """Fan one stream of records out to several windowed handlers.

    Returns a pair of closures sharing the same queue and thread lists:

    * ``reg(handler, width, interval)`` creates a dedicated queue and a
      thread that will run ``window(queue, handler, width, interval)``.
      The thread is only created here, not started.
    * ``run()`` starts every registered thread, then copies each item of
      *src* onto every queue. When *src* is exhausted, ``None`` is put on
      every queue as an end-of-stream marker. ``None`` is falsy, so a
      consumer that only buffers truthy items never mistakes it for a
      record, and a consumer that checks for it can stop.

    :param src: iterable of records; it is consumed once, inside ``run()``.
    :return: tuple ``(reg, run)``.
    """
    queues = []
    threads = []

    def reg(handler,width,interval):
        q = Queue()
        queues.append(q)
        t = threading.Thread(target=window,args=(q,handler,width,interval))
        threads.append(t)

    def run():
        for t in threads:
            t.start()

        for x in src:
            for q in queues:
                q.put(x)

        # Signal that no more records will arrive.
        for q in queues:
            q.put(None)

    return reg,run

if __name__ == "__main__":
    import sys
    # Log file (or directory) to analyse: first command-line argument when
    # given, otherwise the sample file next to the script.
    path = sys.argv[1] if len(sys.argv) > 1 else 'test.log'
    reg,run = dispacher(openfile(path))
    # Status-code distribution over a 20-second window, evaluated every
    # 20 seconds of log time.
    reg(status_handler,20,20)
    run()

実行結果:
{200: 100.0}
{200: 50.0, 403: 50.0}
{200: 50.0, 404: 50.0}

test.logの内容は以下の通りです。
192.168.56.1 - - [18/Mar/2019:10:55:04 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 200 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
192.168.56.2 - - [18/Mar/2019:10:55:14 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 403 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
192.168.56.3 - - [18/Mar/2019:10:55:24 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 200 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
192.168.56.4 - - [18/Mar/2019:10:55:34 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 404 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
192.168.56.6 - - [18/Mar/2019:10:55:44 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 200 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
192.168.56.7 - - [18/Mar/2019:10:55:54 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 403 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"