import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

/usr/local/lib/python3.6/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated since IPython 4.0. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

# Load the events table from a CSV file in the current working directory.
events = pd.read_csv('./events.csv')

# Print the column names to see which features the table provides.
print(events.columns.values)

['event_id' 'user_id' 'start_time' 'city' 'state' 'zip' 'country' 'lat'
 'lng' 'c_1' 'c_2' 'c_3' 'c_4' 'c_5' 'c_6' 'c_7' 'c_8' 'c_9' 'c_10' 'c_11'
 'c_12' 'c_13' 'c_14' 'c_15' 'c_16' 'c_17' 'c_18' 'c_19' 'c_20' 'c_21'
 'c_22' 'c_23' 'c_24' 'c_25' 'c_26' 'c_27' 'c_28' 'c_29' 'c_30' 'c_31'
 'c_32' 'c_33' 'c_34' 'c_35' 'c_36' 'c_37' 'c_38' 'c_39' 'c_40' 'c_41'
 'c_42' 'c_43' 'c_44' 'c_45' 'c_46' 'c_47' 'c_48' 'c_49' 'c_50' 'c_51'
 'c_52' 'c_53' 'c_54' 'c_55' 'c_56' 'c_57' 'c_58' 'c_59' 'c_60' 'c_61'
 'c_62' 'c_63' 'c_64' 'c_65' 'c_66' 'c_67' 'c_68' 'c_69' 'c_70' 'c_71'
 'c_72' 'c_73' 'c_74' 'c_75' 'c_76' 'c_77' 'c_78' 'c_79' 'c_80' 'c_81'
 'c_82' 'c_83' 'c_84' 'c_85' 'c_86' 'c_87' 'c_88' 'c_89' 'c_90' 'c_91'
 'c_92' 'c_93' 'c_94' 'c_95' 'c_96' 'c_97' 'c_98' 'c_99' 'c_100' 'c_other']

events.head()  # preview the first rows of the table

events.shape  # (number of rows, number of columns)

(3069545, 110)

# Drop the identifier, timestamp and location columns so that only the
# count features (c_1 ... c_100 and c_other) remain.
non_feature_columns = [
    'event_id', 'user_id', 'start_time',
    'city', 'state', 'zip', 'country',
    'lat', 'lng',
]
pure_events = events.drop(non_feature_columns, axis=1)

pure_events.head()

# PCA is used further down to reduce the multi-dimensional feature columns to 2 dimensions.
from sklearn.decomposition import PCA

# Convert the pandas DataFrame into a plain NumPy array.
pure_events = np.array(pure_events)

pure_events[0:2]  # show the first two rows

array([[ 2.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  9.],
       [ 2.,  0.,  2.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  2.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  7.]])

np.isnan(pure_events).any()  # True if at least one entry in the array is NaN

True

# np.nan_to_num does not remove anything: every NaN is replaced with 0.0
# (and +/-inf with very large finite numbers), so the number of rows stays
# the same.
pure_events = np.nan_to_num(pure_events)

# Project the feature matrix onto its first two principal components.
# random_state pins the seed: if scikit-learn selects a randomized SVD solver
# for an input of this size, the coordinates would otherwise vary slightly
# from run to run, while later cells filter on absolute coordinate values.
decomposed_events = PCA(n_components=2, random_state=0).fit_transform(pure_events)

decomposed_events.shape

(3069545, 2)

decomposed_events[0:2]  # show the first two projected points

array([[ -3.28156187e+01,  -2.49001607e-02],
       [ -3.47958636e+01,  -1.95290266e-02]])

# Plot first component (x axis) against second component (y axis) using the 'r.' format.
plt.plot(decomposed_events[:, 0], decomposed_events[:, 1], 'r.')

[<matplotlib.lines.Line2D at 0x112e2ce48>]

decomposed_events[:, 1].max()  # largest value in the second column, to look for an abnormal point

21614.879388701447

# Remove the one row whose second-column value is the largest, then redraw
# the scatter without it.
outlier_row = decomposed_events[:, 1].argmax()
decomposed_events = np.delete(decomposed_events, outlier_row, axis=0)

plt.plot(decomposed_events[:, 0], decomposed_events[:, 1], 'r.')

[<matplotlib.lines.Line2D at 0x113ffc828>]

# only reserve data with X < 2000 and Y < 20
decomposed_events = decomposed_events[np.logical_not(np.logical_or(decomposed_events[:,0] > 2000, decomposed_events[:,1] > 20))]

plt.plot(decomposed_events[:, 0], decomposed_events[:, 1], 'r.') # plot main part

[<matplotlib.lines.Line2D at 0x1144c7208>]

	event_id	user_id	start_time	city	state	zip	country	lat	lng	c_1	...	c_93	c_other
0	684921758	3647864012	2012-10-31T00:00:00.001Z	NaN	NaN	NaN	NaN	NaN	NaN	2.0	...	1.0	9.0
1	244999119	3476440521	2012-11-03T00:00:00.001Z	NaN	NaN	NaN	NaN	NaN	NaN	2.0	...	0.0	7.0
2	3928440935	517514445	2012-11-05T00:00:00.001Z	NaN	NaN	NaN	NaN	NaN	NaN	0.0	...	0.0	12.0
3	2582345152	781585781	2012-10-30T00:00:00.001Z	NaN	NaN	NaN	NaN	NaN	NaN	1.0	...	0.0	8.0
4	1051165850	1016098580	2012-09-27T00:00:00.001Z	NaN	NaN	NaN	NaN	NaN	NaN	1.0	...	0.0	9.0

	c_1	c_2	c_3	c_4	c_8	...	c_93	c_other
0	2.0	0.0	2.0	0.0	0.0	...	1.0	9.0
1	2.0	0.0	2.0	0.0	0.0	...	0.0	7.0
2	0.0	0.0	0.0	0.0	0.0	...	0.0	12.0
3	1.0	0.0	2.0	1.0	0.0	...	0.0	8.0
4	1.0	1.0	0.0	0.0	2.0	...	0.0	9.0