scikit-learn
# Embed the accompanying YouTube video in the notebook output.
from IPython.display import YouTubeVideo
YouTubeVideo('2lpS6gUwiJQ')
scikit-learn and friends: numpy, scipy, pandas, and bokeh.

Links:
- scikit-learn — becoming the de-facto machine learning library for Python
- scipy and numpy
- pandas
- bokeh
File comes from here: http://figshare.com/articles/reddit_user_posting_behavior/874101
%%bash
# Preview the raw data: one line per user, followed by a comma-separated
# list of the subreddits that user posted in (lines are ragged).
head reddit_user_posting_behavior.csv
import pandas as pd
# Peek at the first 10 rows with pandas. The rows are ragged (a user followed
# by a variable number of subreddits), so we supply explicit column names for
# up to 25 subreddit columns and blank out the missing cells.
# list(range(25)) works on both Python 2 and 3; a bare range(25) is not a
# list on Python 3 and cannot be concatenated with ["user"].
pd.read_csv("reddit_user_posting_behavior.csv", nrows=10, names=["user"]+list(range(25))).fillna("")
%%time
user_ids = []
subreddit_ids = []
subreddit_to_id = {}
i=0
with open("reddit_user_posting_behavior.csv", 'r') as f:
for line in f:
for sr in line.rstrip().split(",")[1:]:
if sr not in subreddit_to_id:
subreddit_to_id[sr] = len(subreddit_to_id)
user_ids.append(i)
subreddit_ids.append(subreddit_to_id[sr])
i+=1
import numpy as np
from scipy.sparse import csr_matrix

# Build a sparse adjacency matrix: one row per subreddit, one column per user,
# with a 1 at (s, u) for each time user u listed subreddit s.
rows = np.array(subreddit_ids)
cols = np.array(user_ids)
data = np.ones((len(user_ids),))
num_rows = len(subreddit_to_id)  # number of distinct subreddits
num_cols = i                     # number of input lines == number of users
# the code above exists to feed this call
adj = csr_matrix( (data,(rows,cols)), shape=(num_rows, num_cols) )
# print() is valid on both Python 2 and 3 (the original `print x` statement
# form is Python-2-only)
print(adj.shape)
print("")
# now we have our matrix, so let's gather up a bit of info about it
# row sums: how many (user, subreddit) memberships each subreddit has
users_per_subreddit = adj.sum(axis=1).A1
# Invert subreddit_to_id into an id -> name array. Ids were assigned
# consecutively from 0 via len(dict), so sorting the names by their id
# reproduces the positional array exactly. (The original mutated a range()
# object in place, which fails on Python 3 where range is immutable.)
subreddits = np.array(sorted(subreddit_to_id, key=subreddit_to_id.get))
Our adjacency matrix is a bit problematic to deal with as-is: it is large and sparse. Fortunately, scikit-learn has a decomposition package — we'll use TruncatedSVD from scikit-learn.
%%time
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
svd = TruncatedSVD(n_components=100)
embedded_coords = normalize(svd.fit_transform(adj), norm='l1')
print embedded_coords.shape
The output is kind of neat:
%matplotlib inline
# Cumulative explained-variance ratio: how much of the original matrix's
# variance the first k SVD components capture, as k grows to 100.
pd.DataFrame(np.cumsum(svd.explained_variance_ratio_)).plot(figsize=(13, 8))
# this function will show you the axes on which a particular subreddit scores the highest/lowest
def pickOutSubreddit(sr):
    # Locate this subreddit's row in the embedding, then order the 100 axes
    # by that row's score, highest first.
    row_index = list(subreddits).index(sr)
    sorted_axes = embedded_coords[row_index].argsort()[::-1]
    # For each of those axes (in that order), rank every subreddit from
    # highest to lowest score along the axis.
    ranking = np.argsort(embedded_coords[:, sorted_axes], axis=0)[::-1]
    return pd.DataFrame(subreddits[ranking], columns=sorted_axes)
pickOutSubreddit("soccer")
# Hand-picked SVD dimensions with human-readable labels describing what each
# axis appears to separate (labels were inferred by eyeballing the rankings);
# each column lists subreddits from one extreme of that axis to the other.
pd.DataFrame(subreddits[np.argsort(embedded_coords[:,[0, 1, 44,51,84,50,47,40]], axis=0)[::-1]],
columns=[
"0: big - small",
"1: big - small",
"44: soccer - guns",
"51: programming - food",
"84: music - bikes",
"50: osx - books",
"47: wow - starcraft",
"40: male grooming - life hacks"
])
# not shown but also amusing:
# 14: music - pot
# 24: science - porn
# Interactive scatter of subreddits positioned by SVD dimensions 0 and 1;
# hovering a point shows the subreddit name.
import bokeh.plotting as bp
# NOTE(review): bokeh.objects was renamed bokeh.models in later bokeh
# releases — this import pins the notebook to an old bokeh; confirm version.
from bokeh.objects import HoverTool
bp.output_notebook()
# only plot subreddits with more than 100 member-users, to keep the figure legible
row_selector = np.where(users_per_subreddit>100)
bp.figure(plot_width=900, plot_height=700, title="Subreddit Map by Most Informative Dimensions",
x_axis_label = "Dimension 0",
y_axis_label = "Dimension 1",
tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
min_border=1)
bp.scatter(
x = embedded_coords[:,0][row_selector],
y = embedded_coords[:,1][row_selector],
# point size grows with the log of the subreddit's user count
radius= np.log2(users_per_subreddit[row_selector])/6000,
source=bp.ColumnDataSource({"subreddit": subreddits[row_selector]})
).select(dict(type=HoverTool)).tooltips = {"/r/":"@subreddit"}
bp.show()
# Same interactive scatter, but on the hand-labelled "interesting" axes:
# dimension 44 (guns vs soccer) against dimension 51 (food vs programming).
bp.figure(plot_width=900, plot_height=700, title="Subreddit Map by Interesting Dimensions",
x_axis_label = "Guns <–> Soccer (Dimension 44)",
y_axis_label = "Food <–> Programming (Dimension 51)",
tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
min_border=1)
bp.scatter(
x = embedded_coords[:,44][row_selector],
y = embedded_coords[:,51][row_selector],
# point size grows with the log of the subreddit's user count
radius= np.log2(users_per_subreddit[row_selector])/6000,
source=bp.ColumnDataSource({"subreddit": subreddits[row_selector]})
).select(dict(type=HoverTool)).tooltips = {"/r/":"@subreddit"}
bp.show()
scikit-learn has a clustering package — we'll use KMeans from scikit-learn.
%%time
from scipy.stats import rankdata
embedded_ranks = np.array([rankdata(c) for c in embedded_coords.T]).T
from sklearn.cluster import KMeans
n_clusters = 20
km = KMeans(n_clusters)
clusters = km.fit_predict(embedded_ranks)
pd.DataFrame( [subreddits[clusters == i][users_per_subreddit[clusters == i].argsort()[-6:][::-1]] for i in range(n_clusters)] )
# 20 distinct hex colors, one per KMeans cluster (n_clusters == 20);
# looks like d3's category20 palette — TODO confirm provenance.
colormap = np.array([
"#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
"#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
"#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
"#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])
# The dimension-44/51 scatter again, now colored by KMeans cluster assignment.
bp.figure(plot_width=900, plot_height=700, title="Subreddit Map by Interesting Dimensions",
x_axis_label = "Guns <–> Soccer (Dimension 44)",
y_axis_label = "Food <–> Programming (Dimension 51)",
tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
min_border=1)
bp.scatter(
x = embedded_coords[:,44][row_selector],
y = embedded_coords[:,51][row_selector],
# color encodes cluster membership
color= colormap[clusters[row_selector]],
radius= np.log2(users_per_subreddit[row_selector])/6000,
source=bp.ColumnDataSource({"subreddit": subreddits[row_selector]})
).select(dict(type=HoverTool)).tooltips = {"/r/":"@subreddit"}
bp.show()
scikit-learn has a manifold learning package — we'll use TSNE from scikit-learn.
%%time
from sklearn.manifold import TSNE
xycoords = TSNE().fit_transform(embedded_coords[row_selector])
# Scatter the t-SNE 2-D coordinates, colored by the earlier KMeans clusters.
# The axes are suppressed because t-SNE coordinates have no intrinsic meaning.
bp.figure(plot_width=900, plot_height=700, title="Subreddit Map by t-SNE",
tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
x_axis_type=None, y_axis_type=None, min_border=1)
bp.scatter(
x = xycoords[:,0],
y = xycoords[:,1],
color= colormap[clusters[row_selector]],
# note the /60 (vs /6000 above): t-SNE output spans a much larger range
radius= np.log2(users_per_subreddit[row_selector])/60,
source=bp.ColumnDataSource({"subreddit": subreddits[row_selector]})
).select(dict(type=HoverTool)).tooltips = {"/r/":"@subreddit"}
bp.show()
To recap, we used:

- TruncatedSVD to turn our matrix into a surprisingly informative smaller matrix
- KMeans to group our subreddits into some clusters
- TSNE to get coordinates for a scatterplot
- bokeh to make an interactive graphic

TruncatedSVD and TSNE worked as before. TruncatedSVD, KMeans and TSNE are some of my go-to algorithms, but each belongs to a family with other options. scikit-learn is nice, but some of these algorithms can be slow and hard to use.