pythonでapacheなどのwebログを分析する
3933 ワード
1. ログを分析するPythonフレームワーク awk.py
ログのヒット数(行数)を集計する
count_lines.py
使用方法はshellで実行
# cat apachelog.log|python count_lines.py
閲覧回数がn回を超える訪問者を集計する visitors.py
How many people have returned to the site more than N times?
# cat apachelog.log|python visitors.py
ドメイン名ごとにアクセス数を集計する domain.py
# cat apachelog.log|python domain.py
#
# Custom awk.py module
#
class controller:
    """Minimal awk-style driver that feeds each line of a file to handlers.

    A handler is any object providing ``begin()``, ``process_line(line)``,
    ``end()``, ``description()`` and ``result()``.
    """

    def __init__(self, f):
        """Remember the input file object *f* (it must provide ``readline()``)."""
        self.m_file = f
        self.m_handlers = []

    def subscribe(self, o):
        """Register handler *o*; handlers are invoked in subscription order."""
        self.m_handlers.append(o)

    def run(self):
        """Call ``begin()`` on every handler, feed them each line, then ``end()``."""
        for o in self.m_handlers:
            o.begin()
        while True:
            s = self.m_file.readline()
            # readline() returns an empty value only at end of file; testing
            # truthiness also terminates correctly on binary streams.
            if not s:
                break
            for o in self.m_handlers:
                o.process_line(s)
        for o in self.m_handlers:
            o.end()

    def print_results(self):
        """Print each handler's description and result to standard output."""
        rule = "------------------------------------------------------"
        # print("") instead of a bare ``print`` statement: it emits a blank
        # line under both Python 2 and Python 3.
        print("")
        print("Results:")
        print("")
        for o in self.m_handlers:
            print(rule)
            print(o.description())
            print(rule)
            print(o.result())
ログのヒット数(行数)を集計する
count_lines.py
# Standard sys module
import sys
# Custom awk.py module
import awk
class count_lines:
    """Handler that counts how many lines it has been given."""

    def __init__(self):
        # Initialised here as well as in begin() so result() is safe to call
        # even if begin() was never invoked.
        self.m_count = 0

    def begin(self):
        """Reset the counter before a run."""
        self.m_count = 0

    def process_line(self, s):
        """Count one line; the content of *s* is ignored."""
        self.m_count += 1

    def end(self):
        """Nothing to finalise."""
        pass

    def description(self):
        """Return the heading shown above the result."""
        return "# of lines in the file"

    def result(self):
        """Return the number of lines processed since the last begin()."""
        return self.m_count
#
# Step 1: Create the awk controller, reading log lines from standard input
#
ac = awk.controller(sys.stdin)
#
# Step 2: Subscribe the line-counting handler
#
ac.subscribe(count_lines())
#
# Step 3: Run the subscribed handler over the input
#
ac.run()
#
# Step 4: Print the handler's description and result
#
ac.print_results()
使用方法はshellで実行
# cat apachelog.log|python count_lines.py
閲覧回数がn回を超える訪問者を集計する visitors.py
How many people have returned to the site more than N times?
# Standard library modules
import re
import sys

# Custom awk.py module (provides the controller class used below)
import awk
class return_visitors:
    """Handler counting IP addresses seen on more than *n* distinct days.

    Each line is split on whitespace; the first field is taken as the client
    IP and the fourth as the bracketed timestamp, e.g.
    ``[10/Oct/2000:13:55:36``.
    """

    def __init__(self, n):
        """*n* is the threshold: an IP counts if it has more than *n* days."""
        self.m_n = n
        self.m_ip_days = {}  # ip -> set of date strings the ip was seen on
        self.m_count = 0     # set by end(); 0 until then

    def begin(self):
        """Nothing to prepare."""
        pass

    def process_line(self, s):
        """Record the (ip, date) pair of one log line; skip short lines."""
        fields = s.split()
        if len(fields) < 4:
            # Blank or truncated line: no timestamp field to read.
            return
        ip = fields[0]
        # Drop the leading "[" and keep everything before the first ":",
        # i.e. day/month/year. Keeping the year stops the same calendar date
        # in different years from being counted as one day.
        day = fields[3][1:].split(":", 1)[0]
        self.m_ip_days.setdefault(ip, set()).add(day)

    def end(self):
        """Count the IPs whose number of distinct days exceeds the threshold."""
        self.m_count = sum(
            1 for days in self.m_ip_days.values() if len(days) > self.m_n
        )

    def description(self):
        """Return the heading shown above the result."""
        return "# of IP addresses that visited more than %s days" % self.m_n

    def result(self):
        """Return the count computed by end()."""
        return self.m_count
# Read the log from standard input, count the IP addresses seen on more than
# 2 distinct days, and print the report.
ac = awk.controller(sys.stdin)
ac.subscribe(return_visitors(2))
ac.run()
ac.print_results()
# cat apachelog.log|python visitors.py
ドメイン名ごとにアクセス数を集計する domain.py
# Standard library modules
import re
import sys

# Custom awk.py module (provides the controller class used below)
import awk
class referring_domains:
    """Handler that tallies log lines per referring domain.

    The referrer is taken from the 11th whitespace-separated field of each
    line; lines without that field, or whose referrer contains no
    ``//host.tld/`` part, are ignored.
    """

    # "//" + host + "/", capturing the host. The final label must be letters
    # only; the original class "[a-zA-z]" also admitted "[", "\", "]", "^",
    # "_" and "`", and its {2,3} limit skipped longer TLDs such as ".info".
    _DOMAIN_RE = re.compile(r"//([a-zA-Z0-9.-]*\.[a-zA-Z]{2,63})/")

    def __init__(self):
        self.m_domains = {}  # domain -> number of lines referring from it

    def begin(self):
        """Nothing to prepare."""
        pass

    def process_line(self, line):
        """Extract the referring domain of one log line and count it."""
        fields = line.split()
        if len(fields) <= 10:
            # No referrer field on this line.
            return
        m = self._DOMAIN_RE.search(fields[10])
        if m is None:
            # Referrer such as "-" that carries no recognisable domain.
            return
        domain = m.group(1)
        self.m_domains[domain] = self.m_domains.get(domain, 0) + 1

    def end(self):
        """Nothing to finalise."""
        pass

    def description(self):
        """Return the heading shown above the result."""
        return "Referring domains"

    def sort(self, key1, key2):
        """cmp-style comparator ordering domains by descending count.

        No longer used by result(), which sorts with a key function because
        Python 3 dropped cmp-based sorting; kept so existing callers work.
        """
        count1 = self.m_domains[key1]
        count2 = self.m_domains[key2]
        if count1 > count2:
            return -1
        if count1 == count2:
            return 0
        return 1

    def result(self):
        """Return "domain count" lines, busiest first, plus a blank line.

        Domains with equal counts are listed alphabetically so the output is
        deterministic.
        """
        ordered = sorted(
            self.m_domains, key=lambda d: (-self.m_domains[d], d)
        )
        rows = ["%s %s\n" % (d, self.m_domains[d]) for d in ordered]
        return "".join(rows) + "\n"
# Read the log from standard input, tally lines per referring domain, and
# print the report.
ac = awk.controller(sys.stdin)
ac.subscribe(referring_domains())
ac.run()
ac.print_results()
# cat apachelog.log|python domain.py