In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline
/usr/local/lib/python3.6/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated since IPython 4.0. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)
In [2]:
events = pd.read_csv('./events.csv') # load the raw events table from the current working directory
In [3]:
print(events.columns.values) # list every column name in the raw table
['event_id' 'user_id' 'start_time' 'city' 'state' 'zip' 'country' 'lat'
 'lng' 'c_1' 'c_2' 'c_3' 'c_4' 'c_5' 'c_6' 'c_7' 'c_8' 'c_9' 'c_10' 'c_11'
 'c_12' 'c_13' 'c_14' 'c_15' 'c_16' 'c_17' 'c_18' 'c_19' 'c_20' 'c_21'
 'c_22' 'c_23' 'c_24' 'c_25' 'c_26' 'c_27' 'c_28' 'c_29' 'c_30' 'c_31'
 'c_32' 'c_33' 'c_34' 'c_35' 'c_36' 'c_37' 'c_38' 'c_39' 'c_40' 'c_41'
 'c_42' 'c_43' 'c_44' 'c_45' 'c_46' 'c_47' 'c_48' 'c_49' 'c_50' 'c_51'
 'c_52' 'c_53' 'c_54' 'c_55' 'c_56' 'c_57' 'c_58' 'c_59' 'c_60' 'c_61'
 'c_62' 'c_63' 'c_64' 'c_65' 'c_66' 'c_67' 'c_68' 'c_69' 'c_70' 'c_71'
 'c_72' 'c_73' 'c_74' 'c_75' 'c_76' 'c_77' 'c_78' 'c_79' 'c_80' 'c_81'
 'c_82' 'c_83' 'c_84' 'c_85' 'c_86' 'c_87' 'c_88' 'c_89' 'c_90' 'c_91'
 'c_92' 'c_93' 'c_94' 'c_95' 'c_96' 'c_97' 'c_98' 'c_99' 'c_100' 'c_other']
In [4]:
events.head() # preview the first five rows of the raw table
Out[4]:
event_id user_id start_time city state zip country lat lng c_1 ... c_92 c_93 c_94 c_95 c_96 c_97 c_98 c_99 c_100 c_other
0 684921758 3647864012 2012-10-31T00:00:00.001Z NaN NaN NaN NaN NaN NaN 2.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0
1 244999119 3476440521 2012-11-03T00:00:00.001Z NaN NaN NaN NaN NaN NaN 2.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0
2 3928440935 517514445 2012-11-05T00:00:00.001Z NaN NaN NaN NaN NaN NaN 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12.0
3 2582345152 781585781 2012-10-30T00:00:00.001Z NaN NaN NaN NaN NaN NaN 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8.0
4 1051165850 1016098580 2012-09-27T00:00:00.001Z NaN NaN NaN NaN NaN NaN 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0

5 rows × 110 columns

In [5]:
events.shape # (row count, column count) of the raw table
Out[5]:
(3069545, 110)
In [6]:
# Remove the unnecessary attributes so that only the feature columns
# c_1 ... c_other remain.
non_feature_columns = [
    'event_id', 'user_id', 'start_time',
    'city', 'state', 'zip', 'country',
    'lat', 'lng',
]
pure_events = events.drop(labels=non_feature_columns, axis='columns')
In [7]:
pure_events.head() # preview the remaining feature columns (c_1 ... c_other)
Out[7]:
c_1 c_2 c_3 c_4 c_5 c_6 c_7 c_8 c_9 c_10 ... c_92 c_93 c_94 c_95 c_96 c_97 c_98 c_99 c_100 c_other
0 2.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0
1 2.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 12.0
3 1.0 0.0 2.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8.0
4 1.0 1.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9.0

5 rows × 101 columns

In [8]:
from sklearn.decomposition import PCA # use PCA to project the multi-dimensional features down to 2 dimensions
In [9]:
pure_events = np.array(pure_events) # convert the pandas DataFrame to a NumPy array (the name now refers to an ndarray, not a DataFrame)
In [10]:
pure_events[0:2] # show the first two rows of the array
Out[10]:
array([[ 2.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  9.],
       [ 2.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  7.]])
In [13]:
np.isnan(pure_events).any() # True if at least one entry of the array is NaN
Out[13]:
True
In [14]:
pure_events = np.nan_to_num(pure_events) # replace NaN entries with 0.0 (np.nan_to_num also maps +/-inf to large finite values); no rows are dropped
In [15]:
# Project the feature matrix onto its first two principal components.
# random_state is pinned because, for an input this large, scikit-learn's
# default solver selection may pick the randomized SVD, which would otherwise
# make the projection vary slightly from run to run.
decomposed_events = PCA(n_components=2, random_state=0).fit_transform(pure_events)
In [16]:
decomposed_events.shape # one 2-component row per input row
Out[16]:
(3069545, 2)
In [17]:
decomposed_events[0:2] # show the first two projected rows
Out[17]:
array([[ -3.28156187e+01,  -2.49001607e-02],
       [ -3.47958636e+01,  -1.95290266e-02]])
In [18]:
# Scatter of every event in the PCA plane (first vs. second component).
# An explicit Axes is used so the figure carries labels and a title, and
# plt.show() keeps the Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(decomposed_events[:, 0], decomposed_events[:, 1], 'r.')
ax.set(xlabel='principal component 1',
       ylabel='principal component 2',
       title='Events projected onto two principal components')
plt.show()
Out[18]:
[<matplotlib.lines.Line2D at 0x112e2ce48>]
In [19]:
decomposed_events[:, 1].max() # largest second-component value; compare it with the sample rows above to spot an abnormal point
Out[19]:
21614.879388701447
In [20]:
# Remove the abnormal point: the single row whose second component is largest.
# NOTE: every execution of this cell drops one more row, so run it exactly once.
outlier_row = np.argmax(decomposed_events[:, 1])
decomposed_events = np.delete(decomposed_events, outlier_row, axis=0)
In [21]:
# Same scatter again, now that the most extreme second-component row is gone.
# An explicit Axes is used so the figure carries labels and a title, and
# plt.show() keeps the Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(decomposed_events[:, 0], decomposed_events[:, 1], 'r.')
ax.set(xlabel='principal component 1',
       ylabel='principal component 2',
       title='Projected events after removing the extreme point')
plt.show()
Out[21]:
[<matplotlib.lines.Line2D at 0x113ffc828>]
In [25]:
# Keep only the rows with first component <= 2000 and second component <= 20;
# a row is discarded as soon as either coordinate exceeds its limit.
beyond_x = decomposed_events[:, 0] > 2000
beyond_y = decomposed_events[:, 1] > 20
decomposed_events = decomposed_events[~(beyond_x | beyond_y)]
In [27]:
# Scatter of the main part of the data, i.e. whatever is currently left in
# decomposed_events. An explicit Axes is used so the figure carries labels and
# a title, and plt.show() keeps the Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(decomposed_events[:, 0], decomposed_events[:, 1], 'r.')
ax.set(xlabel='principal component 1',
       ylabel='principal component 2',
       title='Projected events: main part')
plt.show()
Out[27]:
[<matplotlib.lines.Line2D at 0x1144c7208>]
In [ ]: