准备环境

anaconda

# Put Anaconda's bin directory on PATH for zsh.
# Open ~/.zshrc in an editor (presumably to add the export line below to it).
nano ~/.zshrc
# Quoted so that PATH entries containing spaces are never word-split.
export PATH="${PATH}:/anaconda/bin"
# Re-read the shell configuration in the current session.
source ~/.zshrc
# Print the values to confirm the change.
echo "$HOME"
echo "$PATH"

ipython

# Update conda itself, then (only if that succeeds) the IPython packages.
conda update conda && conda update ipython ipython-notebook ipython-qtconsole
# Install SciPy with conda.
conda install scipy

PYTHONPATH

# Location of the unpacked Spark distribution.
export SPARK_HOME="/Users/erichan/garden/spark-1.5.1-bin-hadoop2.6"
# Make PySpark and its bundled py4j importable. A PYTHONPATH that is already
# set is kept (appended after the Spark entries) instead of being overwritten.
export PYTHONPATH="${SPARK_HOME}/python/:${SPARK_HOME}/python/lib/py4j-0.8.2.1-src.zip${PYTHONPATH:+:${PYTHONPATH}}"

运行环境

# Fail with a message when SPARK_HOME is unset or empty; an unquoted, empty
# expansion would make `cd` silently switch to $HOME instead.
cd "${SPARK_HOME:?SPARK_HOME is not set}"

# Start the PySpark shell with IPython enabled and the --pylab option passed to it.
IPYTHON=1 IPYTHON_OPTS="--pylab" ./bin/pyspark

数据

1. 获取原始数据

# Base directory that contains the ml-100k data set.
PATH = "/Users/erichan/sourcecode/book/Spark机器学习"
# u.user: one user per line, fields separated by "|".
user_data = sc.textFile("%s/ml-100k/u.user" % PATH)
user_fields = user_data.map(lambda line: line.split("|"))
# u.item: one movie per line, fields separated by "|".
movie_data = sc.textFile("%s/ml-100k/u.item" % PATH)
movie_fields = movie_data.map(lambda lines: lines.split("|"))
# u.data: one rating per line, fields separated by a tab.
rating_data_raw = sc.textFile("%s/ml-100k/u.data" % PATH)
rating_data = rating_data_raw.map(lambda line: line.split("\t"))
# Number of lines in u.item, i.e. the number of movies.
num_movies = movie_data.count()
print num_movies

1682

user_data.first()

u'1|24|M|technician|85711'

movie_data.first()

u'1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0'

rating_data_raw.first()

u'196\t242\t3\t881250949'

2. 探索数据

2.1. 按列统计
num_users = user_fields.map(lambda fields: fields[0]).count()
num_genders = user_fields.map(lambda fields: fields[2]).distinct().count()
num_occupations = user_fields.map(lambda fields: fields[3]).distinct().count()
num_zipcodes = user_fields.map(lambda fields: fields[4]).distinct().count() ratings = rating_data.map(lambda fields: int(fields[2]))
num_ratings = ratings.count()
max_rating = ratings.reduce(lambda x, y: max(x, y))
min_rating = ratings.reduce(lambda x, y: min(x, y))
mean_rating = ratings.reduce(lambda x, y: x + y) / float(num_ratings)
median_rating = np.median(ratings.collect())
ratings_per_user = num_ratings / num_users
ratings_per_movie = num_ratings / num_movies
print "Users: %d, genders: %d, occupations: %d, ZIP codes: %d" % (num_users, num_genders, num_occupations, num_zipcodes)

Users: 943, genders: 2, occupations: 21, ZIP codes: 795

print "Min rating: %d" % min_rating

Min rating: 1

print "Max rating: %d" % max_rating

Max rating: 5

print "Average rating: %2.2f" % mean_rating

Average rating: 3.53

print "Median rating: %d" % median_rating

Median rating: 4

print "Average # of ratings per user: %2.2f" % ratings_per_user

Average # of ratings per user: 106.00

print "Average # of ratings per movie: %2.2f" % ratings_per_movie

Average # of ratings per movie: 59.00

ratings.stats()

(count: 100000, mean: 3.52986, stdev: 1.12566797076, max: 5, min: 1)

2.2. 使用matplotlib的hist函数绘制直方图
# Collect fields[1] of every user record to the driver as a list of ints (the ages).
ages = user_fields.map(lambda x: int(x[1])).collect()
# Draw a normalised histogram of the ages with 20 bins.
hist(ages, bins=20, color='lightblue', normed=True)
# Resize the current figure to 16 x 10 inches.
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)

# Bar chart of how often each rating value occurs.
count_by_rating = ratings.countByValue()
x_axis = np.array(count_by_rating.keys())
y_axis = np.array([float(c) for c in count_by_rating.values()])
# we normalize the y-axis here so that the bars sum to 1
y_axis_normed = y_axis / y_axis.sum()
pos = np.arange(len(x_axis))
width = 1.0
ax = plt.axes()
# Put each tick in the middle of its bar and label it with the rating value.
ax.set_xticks(pos + (width / 2))
ax.set_xticklabels(x_axis)
plt.bar(pos, y_axis_normed, width, color='lightblue')
plt.xticks(rotation=30)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)

# Bar chart of the number of users per occupation, sorted by ascending count.
count_by_occupation = user_fields.map(lambda fields: (fields[3], 1)).reduceByKey(lambda x, y: x + y).collect()
x_axis1 = np.array([c[0] for c in count_by_occupation])
y_axis1 = np.array([c[1] for c in count_by_occupation])
# Compute the sort order once and apply it to both labels and counts.
order = np.argsort(y_axis1)
x_axis = x_axis1[order]
y_axis = y_axis1[order]
pos = np.arange(len(x_axis))
width = 1.0
ax = plt.axes()
# Put each tick in the middle of its bar and label it with the occupation.
ax.set_xticks(pos + (width / 2))
ax.set_xticklabels(x_axis)
plt.bar(pos, y_axis, width, color='lightblue')
plt.xticks(rotation=30)
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16, 10)

2.3. 使用countByValue函数统计
count_by_occupation2 = user_fields.map(lambda fields: fields[3]).countByValue()
print "Map-reduce approach:"
print dict(count_by_occupation2)

{u'administrator': 79, u'retired': 14, u'lawyer': 12, u'healthcare': 16, u'marketing': 26, u'executive': 32, u'scientist': 31, u'student': 196, u'technician': 27, u'librarian': 51, u'programmer': 66, u'salesman': 12, u'homemaker': 7, u'engineer': 67, u'none': 9, u'doctor': 7, u'writer': 45, u'entertainment': 18, u'other': 105, u'educator': 95, u'artist': 28}

print ""
print "countByValue approach:"
print dict(count_by_occupation)

{u'administrator': 79, u'writer': 45, u'retired': 14, u'lawyer': 12, u'doctor': 7, u'marketing': 26, u'executive': 32, u'none': 9, u'entertainment': 18, u'healthcare': 16, u'scientist': 31, u'student': 196, u'educator': 95, u'technician': 27, u'librarian': 51, u'programmer': 66, u'artist': 28, u'salesman': 12, u'other': 105, u'homemaker': 7, u'engineer': 67}

2.4. 使用filter转换
def convert_year(x):
    """Return the last four characters of ``x`` as an int.

    Returns the placeholder 1900 when those characters cannot be parsed as an
    integer (for example an empty release date).
    """
    try:
        return int(x[-4:])
    except (ValueError, TypeError):
        # Only parse failures map to the placeholder; other errors propagate.
        return 1900


# fields[2] of a movie record is the release date string.
years = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x))
# Drop the placeholder rows before computing ages.
years_filtered = years.filter(lambda x: x != 1900)
# Age of each movie relative to 1998, counted per distinct age.
movie_ages = years_filtered.map(lambda yr: 1998-yr).countByValue()
values = movie_ages.values()
bins = movie_ages.keys()
# NOTE(review): this histograms the per-age counts and uses the ages as bin
# edges; it also relies on the dict keys coming back in increasing order.
# Confirm this is the intended plot.
hist(values, bins=bins, color='lightblue', normed=True)

(array([ 0. , 0.07575758, 0.09090909, 0.09090909, 0.18181818,
0.18181818, 0.04545455, 0.07575758, 0.07575758, 0.03030303,
0. , 0.01515152, 0.01515152, 0.03030303, 0. ,
0.03030303, 0. , 0. , 0. , 0. ,
0. , 0. , 0.01515152, 0. , 0.01515152,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0.01515152, 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0. , 0. , 0. , 0. , 0. ,
0.01515152, 0. , 0. , 0. , 0. ]),
array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
68, 72, 76]),
<a list of 70 Patch objects>)

fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16,10)

2.5. 使用groupByKey分组
# to compute the distribution of ratings per user, we first group the ratings by user id
user_ratings_grouped = rating_data.map(lambda fields: (int(fields[0]), int(fields[2]))).groupByKey()
# then, for each key (user id), we find the size of the set of ratings, which gives us the # ratings for that user
user_ratings_byuser = user_ratings_grouped.map(lambda (k, v): (k, len(v)))
user_ratings_byuser.take(5)

[(2, 62), (4, 24), (6, 211), (8, 59), (10, 184)]

# Keep only the per-user rating counts and bring them to the driver.
user_ratings_byuser_local = user_ratings_byuser.map(lambda (k, v): v).collect()
# Normalised histogram of the number of ratings per user, 200 bins.
hist(user_ratings_byuser_local, bins=200, color='lightblue', normed=True)
# Resize the current figure to 16 x 10 inches.
fig = matplotlib.pyplot.gcf()
fig.set_size_inches(16,10)

3. 处理转换

3.1. 填充缺失
# Keep the 1900 placeholder rows: filtering them out here would leave nothing
# for the fill step below to replace.
years_pre_processed = movie_fields.map(lambda fields: fields[2]).map(lambda x: convert_year(x)).collect()
years_pre_processed_array = np.array(years_pre_processed)
# first we compute the mean and median year of release, without the 'bad' data point
mean_year = np.mean(years_pre_processed_array[years_pre_processed_array!=1900])
median_year = np.median(years_pre_processed_array[years_pre_processed_array!=1900])
# then we overwrite every placeholder entry with the median year
idx_bad_data = np.where(years_pre_processed_array==1900)[0]
years_pre_processed_array[idx_bad_data] = median_year
print "Mean year of release: %d" % mean_year

Mean year of release: 1989

print "Median year of release: %d" % median_year

Median year of release: 1995

print "Index of '1900' after assigning median: %s" % np.where(years_pre_processed_array == 1900)[0]

Index of '1900' after assigning median: []

4. 提取特征

4.1. 类别特征(nominal变量/ordinal变量)
all_occupations = user_fields.map(lambda fields: fields[3]).distinct().collect()
all_occupations.sort()
# create a new dictionary to hold the occupations, and assign the "1-of-k" indexes
idx = 0
all_occupations_dict = {}
for o in all_occupations:
all_occupations_dict[o] = idx
idx +=1 # try a few examples to see what "1-of-k" encoding is assigned
print "Encoding of 'doctor': %d" % all_occupations_dict['doctor']
print "Encoding of 'programmer': %d" % all_occupations_dict['programmer']

Encoding of 'doctor': 2
Encoding of 'programmer': 14

numpy的zeros函数
# One slot per known occupation.
K = len(all_occupations_dict)
# Start from an all-zero vector of length K ...
binary_x = np.zeros(K)
# ... and set the slot that belongs to 'programmer' to 1.
k_programmer = all_occupations_dict['programmer']
binary_x[k_programmer] = 1
print "Binary feature vector: %s" % binary_x
print "Length of binary vector: %d" % K

Binary feature vector: [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.
0. 0. 0.]
Length of binary vector: 21

4.2. 派生特征
时间戳转换为类别特征
def extract_datetime(ts):
    """Convert a POSIX timestamp to a ``datetime``.

    ``fromtimestamp`` is called without a tz argument, so the result is in the
    local timezone of the machine that runs it.
    """
    import datetime
    return datetime.datetime.fromtimestamp(ts)


def assign_tod(hr):
    """Map an hour of the day to a time-of-day label.

    Returns one of 'morning', 'lunch', 'afternoon', 'evening' or 'night', or
    None when ``hr`` is not in any of the ranges.
    """
    times_of_day = {
        'morning' : range(7, 12),
        'lunch' : range(12, 15),
        'afternoon' : range(15, 18),
        'evening' : range(18, 23),
        # 7 is listed under 'morning' only; having it in both groups made the
        # label for 7 depend on dict iteration order.
        'night' : {23,24,0,1,2,3,4,5,6}
    }
    for k, v in times_of_day.iteritems():
        if hr in v:
            return k


# fields[3] of each rating record is the timestamp.
timestamps = rating_data.map(lambda fields: int(fields[3]))
hour_of_day = timestamps.map(lambda ts: extract_datetime(ts).hour)
# now apply the "time of day" function to the "hour of day" RDD
time_of_day = hour_of_day.map(lambda hr: assign_tod(hr))
timestamps.take(5)

[881250949, 891717742, 878887116, 880606923, 886397596]

hour_of_day.take(5)

[23, 3, 15, 13, 13]

time_of_day.take(5)

['night', 'night', 'afternoon', 'lunch', 'lunch']

4.3. 文本特征
def extract_title(raw):
import re
grps = re.search("\((\w+)\)", raw)
if grps:
return raw[:grps.start()].strip()
else:
return raw raw_titles = movie_fields.map(lambda fields: fields[1])
for raw_title in raw_titles.take(5):
print extract_title(raw_title)

Toy Story
GoldenEye
Four Rooms
Get Shorty
Copycat

movie_titles = raw_titles.map(lambda m: extract_title(m))
# next we tokenize the titles into terms. We'll use simple whitespace tokenization
title_terms = movie_titles.map(lambda t: t.split(" "))
print title_terms.take(5)

[[u'Toy', u'Story'], [u'GoldenEye'], [u'Four', u'Rooms'], [u'Get', u'Shorty'], [u'Copycat']]

flatMap
all_terms = title_terms.flatMap(lambda x: x).distinct().collect()
# create a new dictionary to hold the terms, and assign the "1-of-k" indexes
idx = 0
all_terms_dict = {}
for term in all_terms:
all_terms_dict[term] = idx
idx +=1 num_terms = len(all_terms_dict)
print "Total number of terms: %d" % num_terms

Total number of terms: 2645

print "Index of term 'Dead': %d" % all_terms_dict['Dead']

Index of term 'Dead': 147

print "Index of term 'Rooms': %d" % all_terms_dict['Rooms']

Index of term 'Rooms': 1963

zipWithIndex
all_terms_dict2 = title_terms.flatMap(lambda x: x).distinct().zipWithIndex().collectAsMap()
print "Index of term 'Dead': %d" % all_terms_dict2['Dead']
print "Index of term 'Rooms': %d" % all_terms_dict2['Rooms']

Index of term 'Dead': 147
Index of term 'Rooms': 1963

创建稀疏向量/广播变量

注意:scipy 的导入依赖于 $PYTHONPATH 的设置

def create_vector(terms, term_dict):
    """Build a 1 x num_terms sparse row with a 1 at the index of every term
    in ``terms`` that is present in ``term_dict``.

    Terms missing from ``term_dict`` are ignored. The width comes from the
    module-level ``num_terms``, not from ``term_dict``.
    """
    from scipy import sparse as sp
    x = sp.csc_matrix((1, num_terms))
    for t in terms:
        if t in term_dict:
            idx = term_dict[t]
            x[0, idx] = 1
    return x


# Broadcast the term dictionary and read it inside the map via .value.
all_terms_bcast = sc.broadcast(all_terms_dict)
term_vectors = title_terms.map(lambda terms: create_vector(terms, all_terms_bcast.value))
term_vectors.take(5)

[<1x2645 sparse matrix of type '<type 'numpy.float64'>'
with 1 stored elements in Compressed Sparse Column format>,
<1x2645 sparse matrix of type '<type 'numpy.float64'>'
with 1 stored elements in Compressed Sparse Column format>,
<1x2645 sparse matrix of type '<type 'numpy.float64'>'
with 1 stored elements in Compressed Sparse Column format>,
<1x2645 sparse matrix of type '<type 'numpy.float64'>'
with 1 stored elements in Compressed Sparse Column format>,
<1x2645 sparse matrix of type '<type 'numpy.float64'>'
with 1 stored elements in Compressed Sparse Column format>]

4.4. 正则化特征
# Fixed seed so the random vector is the same on every run.
np.random.seed(42)
x = np.random.randn(10)
# np.linalg.norm with no extra arguments gives the 2-norm of the vector.
norm_x_2 = np.linalg.norm(x)
# Dividing by the norm scales x to unit length.
normalized_x = x / norm_x_2
print "x:\n%s" % x
print "2-Norm of x: %2.4f" % norm_x_2
print "Normalized x:\n%s" % normalized_x
print "2-Norm of normalized_x: %2.4f" % np.linalg.norm(normalized_x)

x:
[ 0.49671415 -0.1382643 0.64768854 1.52302986 -0.23415337 -0.23413696
1.57921282 0.76743473 -0.46947439 0.54256004]
2-Norm of x: 2.5908
Normalized x:
[ 0.19172213 -0.05336737 0.24999534 0.58786029 -0.09037871 -0.09037237
0.60954584 0.29621508 -0.1812081 0.20941776]
2-Norm of normalized_x: 1.0000

from pyspark.mllib.feature import Normalizer
normalizer = Normalizer()
vector = sc.parallelize([x])
normalized_x_mllib = normalizer.transform(vector).first().toArray() print "x:\n%s" % x
print "2-Norm of x: %2.4f" % norm_x_2
print "Normalized x MLlib:\n%s" % normalized_x_mllib
print "2-Norm of normalized_x_mllib: %2.4f" % np.linalg.norm(normalized_x_mllib)

x:
[ 0.49671415 -0.1382643 0.64768854 1.52302986 -0.23415337 -0.23413696
1.57921282 0.76743473 -0.46947439 0.54256004]
2-Norm of x: 2.5908
Normalized x MLlib:
[ 0.19172213 -0.05336737 0.24999534 0.58786029 -0.09037871 -0.09037237
0.60954584 0.29621508 -0.1812081 0.20941776]
2-Norm of normalized_x_mllib: 1.0000

Spark机器学习2·准备数据(pyspark)的更多相关文章

  1. 客户流失?来看看大厂如何基于spark+机器学习构建千万数据规模上的用户留存模型 ⛵

    作者:韩信子@ShowMeAI 大数据技术 ◉ 技能提升系列:https://www.showmeai.tech/tutorials/84 行业名企应用系列:https://www.showmeai. ...

  2. Spark机器学习5·回归模型(pyspark)

    分类模型的预测目标是:类别编号 回归模型的预测目标是:实数变量 回归模型种类 线性模型 最小二乘回归模型 应用L2正则化时--岭回归(ridge regression) 应用L1正则化时--LASSO ...

  3. Spark机器学习之MLlib整理分析

    友情提示: 本文档根据林大贵的<Python+Spark 2.0 + Hadoop机器学习与大数据实战>整理得到,代码均为书中提供的源码(python 2.X版本). 本文的可以利用pan ...

  4. Spark机器学习MLlib系列1(for python)--数据类型,向量,分布式矩阵,API

    Spark机器学习MLlib系列1(for python)--数据类型,向量,分布式矩阵,API 关键词:Local vector,Labeled point,Local matrix,Distrib ...

  5. Spark机器学习 Day2 快速理解机器学习

    Spark机器学习 Day2 快速理解机器学习 有两个问题: 机器学习到底是什么. 大数据机器学习到底是什么. 机器学习到底是什么 人正常思维的过程是根据历史经验得出一定的规律,然后在当前情况下根据这 ...

  6. Spark机器学习 Day1 机器学习概述

    Spark机器学习 Day1 机器学习概述 今天主要讨论个问题:Spark机器学习的本质是什么,其内部构成到底是什么. 简单来说,机器学习是数据+算法. 数据 在Spark中做机器学习,肯定有数据来源 ...

  7. Spark机器学习笔记一

    Spark机器学习库现支持两种接口的API:RDD-based和DataFrame-based,Spark官方网站上说,RDD-based APIs在2.0后进入维护模式,主要的机器学习API是spa ...

  8. Spark机器学习之协同过滤算法

    Spark机器学习之协同过滤算法 一).协同过滤 1.1 概念 协同过滤是一种借助"集体计算"的途径.它利用大量已有的用户偏好来估计用户对其未接触过的物品的喜好程度.其内在思想是相 ...

  9. Spark机器学习解析下集

    上次我们讲过<Spark机器学习(上)>,本文是Spark机器学习的下部分,请点击回顾上部分,再更好地理解本文. 1.机器学习的常见算法 常见的机器学习算法有:l   构造条件概率:回归分 ...

随机推荐

  1. 利用JQuery jsonp实现Ajax跨域请求 .Net 的*.handler 和 WebService,返回json数据

    1:跨域请求handler一般处理程序 using System; using System.Collections.Generic; using System.Web; using System.W ...

  2. 【转】SetThreadLocale解决越南文乱码问题

    转自http://hi.baidu.com/killwolf110/item/838d56224067c63395f62b70 程序需要运行在越南地区,语言为越南文,操作系统为英文版,程序支持unic ...

  3. css position float (写的相当好)

    对CSS中的Position.Float属性的一些深入探讨 对于Position.Float我们在平时使用上可以说是使用频率非常高的两个CSS属性,对于这两个属性的使用上面可能大多数人存在一些模糊与不 ...

  4. python入门(三):分支、循环、函数

    1.分支 if循环格式:if condition_1: statement_block_1elif condition_2: statement_block_2else: statement_bloc ...

  5. Xcode不自动提示代码

    今天群里有个小朋友惊慌了,“啊啊啊,我的Xcode不能提示代码了,文字都变成黑的了,可怎么办呀...”看到这个我真的是无语了,随手百度一下 ,一大把好不啦,何须惊慌,姐姐我在几年前就遇到了,好在今天不 ...

  6. mysql数据库sql优化——子查询优化

    1.什么是子查询.表关联查询: 子查询:是指在主sql语句中的select或where子句中使用select查询语句:select a.name,(select b.name from b where ...

  7. cocos3.x 接入微信无法调用回调函数onResp的问题

    要想顺利调用必须保证一下几点: 1.WXEntryActivity的包名必须正确,格式为你的APK包名+wxapi.WXEntryActivity(注意:是apk包名,而不是org.cocos2dx. ...

  8. poj3411

    Paid Roads Time Limit: 1000MS   Memory Limit: 65536K Total Submissions: 6549   Accepted: 2427 Descri ...

  9. Store update, insert, or delete statement affected an unexpected number of rows (0). Entities may have been modified or deleted since entities were loaded.

    EF6进行Insert操作的时候提示错误 Store update, insert, or delete statement affected an unexpected number of rows ...

  10. 【BZOJ4688】One-Dimensional 矩阵乘法

    [BZOJ4688]One-Dimensional Description 考虑一个含有 N 个细胞的一维细胞自动机.细胞从 0 到 N-1 标号.每个细胞有一个被表示成一个小于 M 的非负整数的状态 ...