Pythonデータ可視化プログラミング(一)


CSVファイルの読み込み
#method one
import csv
import sys
filename = 'ch02-data.csv'
data = []

try:
    with open(filename) as f:
        reader = csv.reader(f)
    header = reader.next()
    data = [row for row in reader]
except csv.Error as e:
    print"error reading csv file at line %s:%s"%(reader.line_num,e)
    sys.exit(-1)

if header:
    print header
    print"========================="
for datarow in data:
    print datarow


#method two
import numpy
data = numpy.loadtxt('ch02-data.csv',dtype = 'string',delimiter = ',')#     

Excelファイルの読み込み
import xlrd
file = 'data'
wb = xlrd.open_workbook(filename = file)
ws = wb.sheet_by_name('sheet1')
dataset = []

for r in xrange(ws.nrows):
    col = []
    for c in range(ws.ncols):
        col.append(ws.cell(r,c).value)
    dataset.append(col)

from pprint import pprint
pprint(dataset

固定幅データファイルからデータをインポートするには
import struct
import string
datafile = 'ch02-data-width-1M.data'
mask = '9s14s5s'

with open(datafile,'r') as f:
    for line in f:
        fields = struct.Struct(mask).unpack_from(line)#unpack_from()                  
        print'fields:',[field.strip() for field in fields]#strip()         

タブで区切られたファイルからデータを読み込む
import csv
import sys
filename = 'ch02-data.tab'
data = []

try:
    with open(filename) as f:
        reader = csv.reader(f,dialect = csv.excel_tab)
    header = reader.next()
    data = [row for row in reader]
except csv.Error as e:
    print"error reading csv file at line %s:%s"%(reader.line_num,e)
    sys.exit(-1)

if header:
    print header
    print"========================="
for datarow in data:
    print datarow

#        
datafile = 'data-1.tab'
with open(datafile,'r') as f:
    for line in f:
        line = line.strip()
        print line.split('\t')

JSONデータソースからデータをインポート
"""
 JSON       
pip install requests
"""
import requests
url = 'https://github.com/timeline.json'
r = requests.get(url)#      
json_obj = r.json()
repos = set()

for entry in json_obj:
  try:
      repos.add(entry['repository']['url'])
  except KeyError as e:
      print "no key %s.skipping......"%(e)

from pprint import pprint
pprint(repos)

JSON、CSV、Excelへのデータのエクスポート

"""
     JSON、CSV、Excel
"""
#first,import the moudle we need
import os
import sys
import argparse
try:
    import cStringIO as StringIO
except:
    import StringIO
import struct
import json
import csv

#second,define the method to     
def import_data(import_file):
    mask = '9s14s5s'
    data = []
    with open(import_file,'r') as f:
        for line in f:
            fields = struct.Struct(mask).unpack_from(line)
            data.append(list([f.strip() for f in fields]))
    return data

def write_data(data,export_format):
    if export_format == 'csv':
        return write_csv(data)
    elif export_format == 'json':
        return write_json(data)
    elif export_format == 'xlsx':
        return write_xlsx(data)    
    else:
        raise Exception('Illegal format defined')

#third,              
def write_csv(data):
    f = StringIO.StringIO()
    writer = csv.writer(f)
    for row in data:
        writer.writerow(row)
    return f.getvalue()

def write_json(data):
    j = json.dumps(data)
    return j

def write_xlsx(data):
    from xlwt import Workbook
    book = Workbook()
    sheet1 = book.add_sheet('Sheet 1')
    row = 0
    for line in data:
        col = 0
        for datum in line:
            print datum
            sheet1.write(row,col,datum)
            col += 1
        row += 1
        if row > 65535:
            print >> sys.stderr
            break
    f = StringIO.StringIO()
    book.save(f)
    return f.getvalue()

#forth,main  
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('import-file',help = 'Path to a fixed-width data file.')
    parser.add_argument('export_format',help = 'Export format: json, csv, xlsx')
    args = parser.parse_args()
    if args.import_file is None:
        print >> sys.stderr
        sys.exit(1)
    if args.export_format not in ('csv', 'xlsx', 'json'):
        print >> sys.stderr
        sys.exit(1)
    if not os.path.isfile(args.import_file):
        print >> sys.stderr
        sys.exit(1)
    datda = import_data(args.import_file)
    print write_data(data,args.export_format)

HTMLからのデータのインポート
#  HTML       
from lxml.html import parse  
from urllib2 import urlopen  
parsed = parse(urlopen("https://finance.yahoo.com/q/op?s=AAPL+Options"))  
#         ,     。  
doc = parsed.getroot()  
table = doc.findall(".//table")  
#            。  
put = table[1]  
#         ,        。 HTML th         ,td      。  
def _unpack(row, kind="td"):  
    elts = row.findall(".//%s" % kind)  
    return [val.text_content() for val in elts]  
#   ,        ,         。    pandas  TextParser       。  
from pandas.io.parsers import TextParser  
def parse_options_data(table):  
    rows = table.findall(".//tr")  
    header = _unpack(rows[0], kind="th")  
    data = [_unpack(r) for r in rows[1:]]  
    return TextParser(data, names=header).get_chunk()  
#                 
put_data = parse_options_data(put)  
print put_data[:10]  
#   ,            URL  
#       a。  
links = doc.findall(".//a")  
# print links[15:20]  
#        URL         ,get() text_content()    
urls = [lnk.get("href") for lnk in links]  
text = [lnk.text_content() for lnk in links]  
from pprint import pprint  
pprint(urls[:10])  
print “============”  

データベースからデータをインポート
"""
        
pip install sqlite3
"""
#first, SQL     SQLite    
import sqlite3
import sys
if len(sys.argv) < 2:
    print'error: you must supply at least SQL script'
    print'usage: %s tabel.db ./sql-dump.sql'%(sys.argv[0])
    sys.exit(1)
script_path = sys.argv[1]
if len(sys.argv) == 3:
    db = sys.argv[2]
else:
    db = ':memory:'
try:
    con = sqlite3.connect(db)
    with con:
        cur = con.cursor()
        with open(script_path,'rb') as f:
            cur.executescript(f.read())
except sqlite3.Error as err:
    print'Error occured: %s'%err

#             
import sqlite3
import sys
if len(sys.argv) != 2:
    print"please specify database file.'
    sys.exit(1)
db = sys.argv[1]
try:
    con = sqlite3.connect(db)
    with con:
        cur = con.cursor()
        query = 'SELECT ID,Name,Population FROM City ORDER BY Population DESC LIMIT 1000'
        con.text_factory = str
        cur.execute(query)
        resultset = cur.fetchall()
        col_names = [cn[0] for cn in cur.description]
        print'%10s %30s %10s'%tuple(col_names)
        print'='*(10+1+30+1+10)
        for row in resultset:
            print'%10s %30s %10s'%row
except sqlite3.Error as err:
    print'[ERROR]:',err