Basic Data Wrangling
Read data
In [15]:
%matplotlib inline
import json
path = '../../../pydata/ch02/usagov_bitly_data2012-03-16-1331923249.txt'
# one JSON record per line
records = [json.loads(line) for line in open(path)]
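If the log file ever contains blank lines, a slightly more defensive read (a sketch; the same path and one-JSON-object-per-line layout are assumed) skips them and closes the file handle:
In [ ]:
# defensive variant: skip blank lines, let the context manager close the file
records = []
with open(path, encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:
            records.append(json.loads(line))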
Analysis of time zones
In [4]:
# Top 10 with collections.Counter
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)
Out[4]:
In [5]:
# Top 10 with a plain dict (no Counter)
import pandas as pd
from collections import defaultdict
def get_counts(sequence):
    counts = defaultdict(int)  # missing keys start at zero
    for x in sequence:
        counts[x] += 1
    return counts
counts = get_counts(time_zones)
# convert the dict to a DataFrame, then sort to get the top 10
counts_dt = pd.DataFrame(list(counts.items()), columns=['timezone', 'count'])
counts_dt.sort_values(by='count', ascending=False).head(10)
Out[5]:
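Counter and pandas aside, the top N can also be pulled straight from the plain dict by sorting (count, key) pairs. A minimal sketch reusing the counts dict built above (top_counts is a hypothetical helper name):
In [ ]:
def top_counts(count_dict, n=10):
    # (count, timezone) pairs sort by count first
    pairs = [(count, tz) for tz, count in count_dict.items()]
    pairs.sort()
    return pairs[-n:]  # the n largest, in ascending order
top_counts(counts)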
DataFrame
In [8]:
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
# build the DataFrame directly from the list of dicts; missing keys become NaN
frame = DataFrame(records)
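A quick inspection confirms the parse; assuming the data loaded as above, this shows column dtypes and the first few time-zone values (some are empty strings):
In [ ]:
frame.info()      # columns, dtypes, non-null counts
frame['tz'][:10]  # first few time zones; note the blanks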
Cleaning
In [16]:
# quick count
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
# NaN becomes 'Missing'; empty strings become 'Unknown'
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10].plot(kind='barh', rot=0)
Out[16]:
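The two cleaning steps can also be chained into one expression with Series.replace; a sketch that should be equivalent to the fillna plus boolean assignment above:
In [ ]:
clean_tz = frame['tz'].fillna('Missing').replace('', 'Unknown')
clean_tz.value_counts()[:10]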
Aggregation
In [10]:
# build a pivot-style count table: time zone x operating system
results = Series([x.split()[0] for x in frame.a.dropna()])
# Windows or not
cframe = frame[frame.a.notnull()]
# label array for grouping
ops_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
# ops_system can be passed straight to groupby alongside the 'tz' column
by_tz_os = cframe.groupby(['tz', ops_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
Out[10]:
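pd.crosstab builds the same tz-by-OS count table in one call, which makes a handy cross-check on the groupby/unstack result (a sketch reusing cframe and ops_system from above):
In [ ]:
# cross-tabulate time zone against the OS label array
pd.crosstab(cframe['tz'], ops_system)[:10]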
In [14]:
# aggregate on time zone only: rank time zones by total count
indexer = agg_counts.sum(axis=1).argsort()
indexer[:5]
# the indexer can reorder the entire pivot table at once - awesome!
count_subset = agg_counts.take(indexer)[-10:]
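In newer pandas the same subset can be taken without argsort/take, e.g. by sorting the row totals; count_subset_alt is a hypothetical name for this sketch, and its result should match count_subset:
In [ ]:
# order rows by total count, keep the 10 largest
totals = agg_counts.sum(axis=1)
count_subset_alt = agg_counts.loc[totals.sort_values().index[-10:]]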
In [17]:
# plot
count_subset.plot(kind='barh', stacked=True)
Out[17]:
In [21]:
normed_subset = count_subset.div(count_subset.sum(axis=1), axis=0)
normed_subset.plot(kind='barh', stacked=True)
Out[21]:
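Each row of normed_subset should now sum to 1; a one-line sanity check (sketch):
In [ ]:
normed_subset.sum(axis=1)  # expect 1.0 for every time zone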