PythonでApacheなどのWebログを分析する

3933 ワード

1. ログを分析するためのPythonフレームワーク awk.py
#
# Custom awk.py module
#


class controller:
    """Drive line-oriented handlers over a file, in the spirit of awk.

    Every subscribed handler must provide five methods: ``begin()``,
    ``process_line(s)``, ``end()``, ``description()`` and ``result()``.
    """

    def __init__(self, f):
        """Store the input file *f*; only its ``readline()`` method is used."""
        self.m_file = f
        self.m_handlers = []

    def subscribe(self, o):
        """Register handler *o*; handlers are invoked in subscription order."""
        self.m_handlers.append(o)

    def run(self):
        """Call ``begin()`` on each handler, feed every line, then ``end()``."""
        for o in self.m_handlers:
            o.begin()

        # readline() returns "" only at end of file (a blank line still
        # carries its newline), so "" is a safe sentinel for the iterator.
        for s in iter(self.m_file.readline, ""):
            for o in self.m_handlers:
                o.process_line(s)

        for o in self.m_handlers:
            o.end()

    def print_results(self):
        """Print each handler's description and result to standard output."""
        separator = "------------------------------------------------------"

        # Single-argument print(...) calls emit the same text whether print
        # is a statement (Python 2) or a function (Python 3).
        print("")
        print("Results:")
        print("")

        for o in self.m_handlers:
            print(separator)
            print(o.description())
            print(separator)
            print(o.result())

ログの行数(ヒット数)を集計する
count_lines.py
# Standard sys module
import sys

# Custom awk.py module
import awk

class count_lines:
    """Handler that counts how many lines it is given."""

    def begin(self):
        """Reset the line counter to zero."""
        self.m_count = 0

    def process_line(self, s):
        """Count one more line; the content of *s* is not inspected."""
        self.m_count = self.m_count + 1

    def end(self):
        """Nothing to finalize."""
        return None

    def description(self):
        """Return the label shown above the result."""
        return "# of lines in the file"

    def result(self):
        """Return the number of lines counted since ``begin()``."""
        return self.m_count


#
# Step 1: Create the awk controller, reading the log from standard input
#
ac = awk.controller(sys.stdin)

#
# Step 2: Subscribe the line-counting handler
#
ac.subscribe(count_lines())

#
# Step 3: Run the controller over the input
#
ac.run()

#
# Step 4: Print each handler's description and result
#
ac.print_results()

使い方:シェルで次のように実行する
# cat apachelog.log|python count_lines.py
n日を超えて訪問した訪問者(IPアドレス)の数を集計する visitors.py
How many people have returned to the site more than N times?
import re;
import sys
import awk

class return_visitors:
    """Handler that counts IP addresses seen on more than *n* distinct days."""

    def __init__(self, n):
        """Set the threshold *n*; an IP must appear on more than *n* days."""
        self.m_n = n
        # Maps IP address -> list of distinct day keys, in first-seen order.
        self.m_ip_days = {}
        # Initialised here so result() is safe to call before end().
        self.m_count = 0

    def begin(self):
        """Nothing to prepare."""
        pass

    def process_line(self, s):
        """Record the day on which the IP in log line *s* was seen.

        Lines with fewer than four whitespace-separated fields are ignored.
        """
        fields = s.split()
        try:
            ip = fields[0]
            # Skips the first character and keeps the next six, e.g.
            # "10/Oct" from "[10/Oct/2000:13:55:36".
            # NOTE(review): assumes the Apache common/combined log format;
            # the year is not part of the key -- confirm this is intended.
            day = fields[3][1:7]
        except IndexError:
            return

        # "in" replaces dict.has_key(), which no longer exists on Python 3.
        days = self.m_ip_days.setdefault(ip, [])
        if day not in days:
            days.append(day)

    def end(self):
        """Count the IPs whose number of distinct days exceeds the threshold."""
        self.m_count = sum(
            1 for days in self.m_ip_days.values() if len(days) > self.m_n
        )

    def description(self):
        """Return the label shown above the result."""
        return "# of IP addresses that visited more than %s days" % self.m_n

    def result(self):
        """Return the count computed by ``end()`` (0 before ``end()`` runs)."""
        return self.m_count
# Read the log from standard input and report how many IP addresses were
# seen on more than 2 distinct days.
ac = awk.controller(sys.stdin)
ac.subscribe(return_visitors(2))
# Runs begin(), then process_line() for every input line, then end().
ac.run()
ac.print_results()

# cat apachelog.log|python visitors.py

参照元ドメインごとにアクセス数を集計する domain.py
import re;
import sys
import awk

class referring_domains:
    """Handler that counts hits per referring domain."""

    # Host part of a URL: "//", the host name, then "/". The final label is
    # limited to 2-3 ASCII letters, so longer top-level domains do not match.
    # The last character class is [a-zA-Z]; the earlier [a-zA-z] also
    # admitted the punctuation characters that sit between "Z" and "a".
    _DOMAIN_RE = re.compile(r'//([a-zA-Z0-9\-\.]*\.[a-zA-Z]{2,3})/')

    def __init__(self):
        """Start with an empty domain -> hit count table."""
        self.m_domains = {}

    def begin(self):
        """Nothing to prepare."""
        pass

    def process_line(self, line):
        """Count the referring domain found in log line *line*.

        The referrer is taken from the 11th whitespace-separated field.
        Lines with fewer fields, or whose referrer contains no matching
        host name, are ignored.
        """
        try:
            referrer = line.split()[10]
        except IndexError:
            return

        m = self._DOMAIN_RE.search(referrer)
        if m is None:
            return

        domain = m.group(1)
        # dict.get() replaces has_key(), which no longer exists on Python 3.
        self.m_domains[domain] = self.m_domains.get(domain, 0) + 1

    def end(self):
        """Nothing to finalize."""
        pass

    def description(self):
        """Return the label shown above the result."""
        return "Referring domains"

    def sort(self, key1, key2):
        """Compare two domains cmp-style so that higher hit counts sort first.

        Kept for callers that still use it; ``result()`` no longer needs it.
        """
        if self.m_domains[key1] > self.m_domains[key2]:
            return -1
        elif self.m_domains[key1] == self.m_domains[key2]:
            return 0
        else:
            return 1

    def result(self):
        """Return one "domain count" line per domain, most hits first.

        The text ends with two extra newlines, as the original report did.
        """
        # A key function with reverse=True gives the same descending order
        # as the cmp-style sort() above and works on Python 2 and 3.
        ranked = sorted(self.m_domains, key=self.m_domains.get, reverse=True)
        lines = ["%s %s\n" % (domain, self.m_domains[domain]) for domain in ranked]
        return "".join(lines) + "\n\n"


# Read the log from standard input and report hits per referring domain.
ac = awk.controller(sys.stdin)
ac.subscribe(referring_domains())
ac.run()
ac.print_results()

# cat apachelog.log|python domain.py