
26404 ワード

都市の infobox データを処理します。データを監査し、クリーニングの方針を検討したうえで、データを整理します。最初の演習では、データセット内の特定のフィールドについて値のデータ型を確認します。値の型は次のいずれかです。1. NoneType: 値が文字列「NULL」または空文字列の場合。2. list: 値が「{」で始まる場合。3. int: 値を整数に変換できる場合。4. float: 値を整数には変換できないが、浮動小数点数には変換できる場合。たとえば「3.23e+07」は浮動小数点数に変換できるため float として扱いますが、int('3.23e+07') は ValueError を送出します。5. str: 上記以外のすべての値。
import codecs
import csv
import json
import pprint

CITIES = 'cities.csv'

FIELDS = ["name", "timeZone_label", "utcOffset", "homepage", "governmentType_label",
          "isPartOf_label", "areaCode", "populationTotal", "elevation",
          "maximumElevation", "minimumElevation", "populationDensity",
          "wgs84_pos#lat", "wgs84_pos#long", "areaLand", "areaMetro", "areaUrban"]
def get_type(value):
    """Return the Python type that best describes a raw CSV cell.

    The checks run in this order and the first match wins:

    1. ``type(None)`` -- the cell is ``None``, the literal string ``'NULL'``
       or the empty string.
    2. ``list`` -- the cell starts with ``'{'``.
    3. ``int`` -- ``int(value)`` succeeds.
    4. ``float`` -- ``float(value)`` succeeds (for example ``'3.23e+07'``,
       which ``int`` rejects with ``ValueError``).
    5. ``str`` -- anything else.

    Args:
        value: Cell text, or ``None`` (``csv.DictReader`` uses ``None`` for
            cells missing from a row shorter than the header).

    Returns:
        One of the type objects ``type(None)``, ``list``, ``int``, ``float``
        or ``str``.
    """
    if value is None or value in ("NULL", ""):
        return type(None)
    if value.startswith("{"):
        return list
    # int must be tried before float: every integer literal also parses as
    # a float, so the order decides which type is reported.
    for caster in (int, float):
        try:
            return type(caster(value))
        except ValueError:
            continue
    return str


def audit_file(filename, fields):
    """Collect the set of value types seen in each audited CSV column.

    Args:
        filename: Path of the CSV file to audit.
        fields: Names of the columns to audit.

    Returns:
        A dict mapping every name in ``fields`` to the set of types (as
        returned by ``get_type``) found in that column. A field that is not
        a column of the file maps to an empty set.
    """
    fieldtypes = {field: set() for field in fields}
    with open(filename, "r") as data_file:
        reader = csv.DictReader(data_file)
        # Skip the three rows that follow the header; a default is passed to
        # next() so a file shorter than that yields empty sets, not an error.
        for _ in range(3):
            next(reader, None)
        for row in reader:
            # Only audited columns are typed, so surplus cells of a ragged
            # row (stored by DictReader as a list) never reach get_type.
            for field in fieldtypes:
                if field in row:
                    fieldtypes[field].add(get_type(row[field]))
    return fieldtypes


def test():
    """Audit the cities file and check the types found in two area columns."""
    observed = audit_file(CITIES, FIELDS)

    # Expected type sets, built from sample values of each type.
    float_type, list_type, none_type = type(1.1), type([]), type(None)
    expected_land = {float_type, list_type, none_type}
    expected_metro = {float_type, none_type}

    assert observed["areaLand"] == expected_land
    assert observed['areaMetro'] == expected_metro
# Run the audit exercise's self-check when the file is executed as a script.
if __name__ == "__main__":
    test()

def fix_area(area):
    """Convert a raw ``areaLand`` cell to a float, or ``None`` if it has none.

    A cell starting with ``'{'`` is treated as a ``|``-separated list of
    alternative values. Among the alternatives that parse as floats, the one
    written with the most characters is returned (presumably a proxy for the
    most significant digits); on equal length the later alternative wins.

    Args:
        area: Cell text such as ``'5.5e+07'``, ``'{1.4e+07|1.45816e+07}'``,
            ``'NULL'`` or ``''``; ``None`` is accepted as well.

    Returns:
        The area as a float, or ``None`` when no numeric value is present.
    """
    if area is None:
        return None
    if not area.startswith("{"):
        return _parse_float(area)

    best_text = None
    best_value = None
    for text in area.replace("{", "").replace("}", "").split("|"):
        text = text.strip()
        value = _parse_float(text)
        if value is None:
            continue
        # ">=" lets a later alternative win a tie in length.
        if best_text is None or len(text) >= len(best_text):
            best_text, best_value = text, value
    return best_value


def _parse_float(text):
    """Return ``float(text)``, or ``None`` when ``text`` is not numeric."""
    try:
        return float(text)
    except ValueError:
        return None


def process_file(filename):
    """Read the CSV at ``filename`` and normalise every ``areaLand`` cell.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, in file order, where the
        ``areaLand`` value (if the column exists) has been replaced by the
        result of ``fix_area``.
    """
    # CHANGES TO THIS FUNCTION WILL BE IGNORED WHEN YOU SUBMIT THE EXERCISE
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # skipping the extra metadata
        for _ in range(3):
            next(reader, None)
        # processing file
        for line in reader:
            # calling your function to fix the area value
            if "areaLand" in line:
                line["areaLand"] = fix_area(line["areaLand"])
            data.append(line)
    return data


def test():
    """Process the cities file and spot-check cleaned ``areaLand`` values."""
    data = process_file(CITIES)

    print("Printing three example results:")
    for n in range(5, 8):
        pprint.pprint(data[n]["areaLand"])

    assert data[3]["areaLand"] is None
    assert data[8]["areaLand"] == 55166700.0
    assert data[20]["areaLand"] == 14581600.0
    assert data[33]["areaLand"] == 20564500.0


# Run the areaLand exercise's self-check when executed as a script.
if __name__ == "__main__":
    test()

def fix_name(name):
    """Convert a raw ``name`` cell to a list of names.

    Args:
        name: Cell text. ``'NULL'``, ``''`` and ``None`` mean "no name"; a
            value starting with ``'{'`` is a ``|``-separated list of names;
            anything else is a single name.

    Returns:
        A list of name strings (empty when the cell carries no name).
    """
    # The empty string is treated like 'NULL' rather than as a name.
    if name is None or name in ("NULL", ""):
        return []
    if name.startswith("{"):
        return name.replace("{", "").replace("}", "").split("|")
    return [name]


def process_file(filename):
    """Read the CSV at ``filename`` and normalise every ``name`` cell.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, in file order, where the ``name``
        value (if the column exists) has been replaced by the result of
        ``fix_name``.
    """
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # skipping the extra metadata
        for _ in range(3):
            next(reader, None)
        # processing file
        for line in reader:
            # calling your function to fix the name value
            if "name" in line:
                line["name"] = fix_name(line["name"])
            data.append(line)
    return data


def test():
    """Process the cities file and spot-check cleaned ``name`` values."""
    data = process_file(CITIES)

    print("Printing 20 results:")
    for n in range(20):
        pprint.pprint(data[n]["name"])

    assert data[14]["name"] == ['Negtemiut', 'Nightmute']
    assert data[9]["name"] == ['Pell City Alabama']
    assert data[3]["name"] == ['Kumhari']


# Run the name exercise's self-check when executed as a script.
if __name__ == "__main__":
    test()

def check_loc(point, lat, longi):
    """Tell whether ``point`` spells out exactly ``lat`` and ``longi``.

    The comparison is textual, not numeric: ``'44.5783'`` does not match
    ``'44.57833333333333'``.

    Args:
        point: Text of the form ``'<latitude> <longitude>'`` (one space).
        lat: Latitude text to compare with the first part of ``point``.
        longi: Longitude text to compare with the second part of ``point``.

    Returns:
        ``True`` when both parts match; ``False`` otherwise, including when
        ``point`` is empty, ``None`` or has no second part.
    """
    if not point:
        return False
    parts = point.split(" ")
    # A value without a separating space (e.g. 'NULL') has no longitude.
    if len(parts) < 2:
        return False
    return parts[0] == lat and parts[1] == longi


def process_file(filename):
    """Read the CSV at ``filename`` and report rows whose location disagrees.

    For every data row, ``point`` is compared with ``wgs84_pos#lat`` and
    ``wgs84_pos#long`` through ``check_loc``; mismatching rows are printed.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, in file order, unmodified.

    Raises:
        KeyError: If the file lacks one of the columns ``point``,
            ``wgs84_pos#lat``, ``wgs84_pos#long`` (or ``name`` when a
            mismatch has to be reported).
    """
    data = []
    with open(filename, "r") as f:
        reader = csv.DictReader(f)
        # skipping the extra metadata
        for _ in range(3):
            next(reader, None)
        # processing file
        for line in reader:
            # calling your function to check the location
            result = check_loc(line["point"], line["wgs84_pos#lat"], line["wgs84_pos#long"])
            if not result:
                print("{}: {} != {} {}".format(line["name"], line["point"], line["wgs84_pos#lat"], line["wgs84_pos#long"]))
            data.append(line)
    return data


def test():
    """Check ``check_loc`` on one matching and one mismatching location."""
    assert check_loc("33.08 75.28", "33.08", "75.28") is True
    # The comparison is textual, so truncated coordinates do not match.
    assert check_loc("44.57833333333333 -91.21833333333333", "44.5783", "-91.2183") is False


# Run the location exercise's self-check when executed as a script.
if __name__ == "__main__":
    test()