Basic Data Wrangling¶

Read data¶

In [15]:

%pylab inline
import json
path = '../../../pydata/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
records = [json.loads(line) for line in open(path)]

Populating the interactive namespace from numpy and matplotlib

Analysis of timezone¶

In [4]:

# Top 10
time_zones = [rec['tz'] for rec in records if 'tz' in rec]

from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)

Out[4]:

[('America/New_York', 1251),
 ('', 521),
 ('America/Chicago', 400),
 ('America/Los_Angeles', 382),
 ('America/Denver', 191),
 ('Europe/London', 74),
 ('Asia/Tokyo', 37),
 ('Pacific/Honolulu', 36),
 ('Europe/Madrid', 35),
 ('America/Sao_Paulo', 33)]

In [5]:

# Top10

import pandas as pd
from collections import defaultdict
def get_counts(sequence):
    counts = defaultdict(int) # all value will be initilized as zero
    for x in sequence:
        counts[x] += 1
    return counts

counts = get_counts(time_zones)
# convert dict to dataframe. then you can sort to get Top 10
counts_dt = pd.DataFrame(list(counts.items()), columns=['timezone','count'])
counts_dt.sort_values(by = 'count', ascending = False).head(10)

Out[5]:

	timezone	count
63	America/New_York	1251
0		521
96	America/Chicago	400
41	America/Los_Angeles	382
93	America/Denver	191
31	Europe/London	74
68	Asia/Tokyo	37
77	Pacific/Honolulu	36
53	Europe/Madrid	35
88	America/Sao_Paulo	33

Dataframe¶

In [8]:

from pandas import DataFrame, Series
import pandas as pd
import numpy as np

frame = DataFrame(records)

Cleaning¶

In [16]:

# quick count
tz_counts = frame['tz'].value_counts()
tz_counts[:10]

# NA and blank
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()

tz_counts[:10].plot(kind='barh', rot=0)

Out[16]:

<matplotlib.axes._subplots.AxesSubplot at 0x105236f60>

Aggregation¶

In [10]:

# pivot
results = Series([x.split()[0] for x in frame.a.dropna()])
# is wondows or not
cframe = frame[frame.a.notnull()]
# create array for grouping
ops_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
# so that ops_system can be used below for grouping
by_tz_os = cframe.groupby(['tz', ops_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]

Out[10]:

	Not Windows	Windows
tz
	245.0	276.0
Africa/Cairo	0.0	3.0
Africa/Casablanca	0.0	1.0
Africa/Ceuta	0.0	2.0
Africa/Johannesburg	0.0	1.0
Africa/Lusaka	0.0	1.0
America/Anchorage	4.0	1.0
America/Argentina/Buenos_Aires	1.0	0.0
America/Argentina/Cordoba	0.0	1.0
America/Argentina/Mendoza	0.0	1.0

In [14]:

# aggregate on timezone only
indexer = agg_counts.sum(1).argsort()
indexer[:5]

# can be used to take entire pivot - awesome!
count_subset = agg_counts.take(indexer)[-10:]

In [17]:

# plot
count_subset.plot(kind='barh', stacked=True)

Out[17]:

<matplotlib.axes._subplots.AxesSubplot at 0x10527ff98>

In [21]:

normed_subset = count_subset.div(count_subset.sum(1),axis=0)
normed_subset.plot(kind='barh', stacked=True)

Out[21]:

<matplotlib.axes._subplots.AxesSubplot at 0x1054898d0>

Use Pandas to handle access log

Basic Data Wrangling¶

Read data¶

Analysis of timezone¶

Dataframe¶

Cleaning¶

Aggregation¶