This is the original code for predicting the depth and breadth (repost scale) of Weibo posts. It took about half a month to write; the first version looked quite different, but this version contains all the required functionality.

It is also more modular. Before job hunting I had hoped to solve this problem perfectly in Python, but I later realized that both my approach and my hardware had serious limitations.

This was my first serious attempt at processing a large amount of data within a distributed computing framework.

It made me aware of many issues and strongly shaped the coding style I brought to interviews.

def get_basic_info():
    win_path = "E:/spark/weibo_predict/"
    linux_path = "/home/jason/spark/weibo_predict/"
    path = linux_path
    train_path = path + 'train/'
    test_path = path + 'test/'
    code_path = path + 'source_code/'
    print('\nTraining files directory px: %s' % train_path)
    print('\nTest files directory py: %s' % test_path)
    print('\nSource code directory pz: %s' % code_path)
    train_weibo_raw_path = path + "train_weibo_raw.txt"
    train_weibo_repost_path = path + "train_weibo_repost_back.txt"
    test_weibo_raw_path = path + "test_weibo_raw.txt"
    test_weibo_repost_path = path + "test_weibo_repost.txt"
    user_relations_path = path + "user_relations_back.txt"
    print("\nTraining raw weibo file p1: %s" % train_weibo_raw_path)
    print("Training repost file p2: %s" % train_weibo_repost_path)
    print("\nTest raw weibo file p3: %s" % test_weibo_raw_path)
    print("Test repost file p4: %s" % test_weibo_repost_path)
    print("\nUser relations file p5: %s" % user_relations_path)
    return train_path, test_path, code_path, train_weibo_raw_path, train_weibo_repost_path, \
           test_weibo_raw_path, test_weibo_repost_path, user_relations_path
# Pass 'train' (raw weibos, reposts) or 'test' (raw weibos, reposts)
# Returns the reduced mapping repost_id_line_time_reduce
# Returns wid_uid_rdd, which maps each weibo id to the id of the user who posted it
from pyspark import SparkContext

def get_prime_rdd(train_or_test, sc, p1, p2, p3, p4):
    if train_or_test == 'train':
        inside_path_a = p1
        inside_path_b = p2
    elif train_or_test == 'test':
        inside_path_a = p3
        inside_path_b = p4
    else:
        print("only input train or test")
        return 0, 0
    train_weibo_raw_data = sc.textFile(inside_path_a)
    train_weibo_raw_data_count = train_weibo_raw_data.count()
    train_weibo_raw_data_rdd = train_weibo_raw_data.map(lambda x: x.split("\001"))
    w_id = train_weibo_raw_data_rdd.map(lambda x: x[0])
    u_id = train_weibo_raw_data_rdd.map(lambda x: x[1])
    wid_uid_rdd = w_id.zip(u_id)
    train_weibo_repost_data = sc.textFile(inside_path_b)
    train_weibo_repost_data_count = train_weibo_repost_data.count()
    train_weibo_repost_data_rdd = train_weibo_repost_data.map(lambda x: x.split("\001"))
    repost_id = train_weibo_repost_data_rdd.map(lambda x: x[0])
    repost_line_time = train_weibo_repost_data_rdd.map(lambda x: x[1:-1])
    repost_id_line_time = repost_id.zip(repost_line_time)
    repost_id_line_time_reduce = repost_id_line_time.groupByKey().mapValues(list)
    # Keep only the keys present in both RDDs (a double subtractByKey acts as an intersection by key)
    repost_id_line_time_reduce = repost_id_line_time_reduce.subtractByKey(
        repost_id_line_time_reduce.subtractByKey(wid_uid_rdd))
    wid_uid_rdd = wid_uid_rdd.subtractByKey(wid_uid_rdd.subtractByKey(repost_id_line_time_reduce))
    return repost_id_line_time_reduce, wid_uid_rdd
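# A minimal toy sketch (not part of the pipeline) of the double-subtractByKey idiom used above:
# removing from `a` the keys that `b` lacks leaves only the keys present in both RDDs.
# The IDs and values below are made up; the SparkContext `sc` used throughout this script is assumed.
toy_a = sc.parallelize([("w1", 1), ("w2", 2), ("w3", 3)])
toy_b = sc.parallelize([("w2", "u2"), ("w3", "u3"), ("w4", "u4")])
toy_only_in_a = toy_a.subtractByKey(toy_b)         # [("w1", 1)] -- keys of toy_a missing from toy_b
toy_a_common = toy_a.subtractByKey(toy_only_in_a)  # keys present in both: [("w2", 2), ("w3", 3)]
print(sorted(toy_a_common.collect()))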
def get_uid_fnum_rdd(sc, p5):
    user_relations_data = sc.textFile(p5)
    user_relations_data_count = user_relations_data.count()
    user_relations_data_rdd_1 = user_relations_data.map(lambda x: x.split("\t")[0])
    user_relations_data_rdd_2 = user_relations_data.map(lambda x: x.split("\t")[1])
    user_relations_data_rdd_user = user_relations_data_rdd_1
    user_relations_data_rdd_fans = user_relations_data_rdd_2.map(lambda x: x.split("\x01"))
    user_fans = user_relations_data_rdd_user.zip(user_relations_data_rdd_fans)
    fans_nums = user_relations_data_rdd_fans.map(lambda s: len(s))
    uid_fnum_rdd = user_fans.keys().zip(fans_nums)
    return uid_fnum_rdd
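# A quick local illustration (toy data, no Spark needed) of the user_relations line format assumed above:
# "<uid>\t<fan1>\x01<fan2>\x01...", so the fan count is simply the number of \x01-separated tokens.
toy_line = "user_42\tfan_1\x01fan_2\x01fan_3"
toy_uid, toy_fans_field = toy_line.split("\t")
print(toy_uid, len(toy_fans_field.split("\x01")))   # user_42 3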
## Version 2: compute the repost count for a given time slice (900-second windows)
def cal_times_j(list, j):
    ct = 0
    for i in range(len(list)):
        #if int(list[i][-1]) >= j*900 and int(list[i][-1]) <= (j+1)*900:
        # Switch between the per-interval count (above) and the cumulative count (below)
        if int(list[i][-1]) <= j * 900:
            ct += 1
    return ct

def cal_id_times_j(rdd, j):
    times = rdd.values().map(lambda x: cal_times_j(x, j))
    rdd = rdd.keys().zip(times)
    return rdd

def generate_times_file(rdd, k, path):
    import csv
    for j in range(k - 1, k + 1):
        a_path = str(path) + 'wid_times/wid_times_' + str(j) + '.csv'
        #print(path)
        out_file_train_times_j = open(a_path, 'w')
        writer = csv.writer(out_file_train_times_j)
        zhuanfa = cal_id_times_j(rdd, j + 1)
        for lists in zhuanfa.collect():
            writer.writerow(lists)
        out_file_train_times_j.close()
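# A toy check (not part of the pipeline) of cal_times_j's cumulative window; records are assumed to
# end with the repost time in seconds, and the values below are made up.
toy_reposts = [["u1", "u2", "300"], ["u2", "u3", "1200"], ["u1", "u4", "2500"]]
print(cal_times_j(toy_reposts, 1))   # 1 -> only the repost at 300s falls within 1*900s
print(cal_times_j(toy_reposts, 2))   # 2 -> the reposts at 300s and 1200s fall within 2*900s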
# Depth computation
# Keep only the repost relations that happened within the given time window
def cal_during(list, j):
    new_list = []
    for i in range(len(list)):
        if int(list[i][-1]) <= j * 900:
            new_list.append(list[i])
    return new_list

# Apply the time-window filter to every element of an RDD
def cal_rdd_during(rdd, j):
    return rdd.map(lambda x: cal_during(x, j))

# If the tail of one repost chain is the head of another, append that other chain's tail to extend it
def add_deep(list):
    kkk = len(list)
    if kkk <= 1:
        return list
    else:
        for i in range(kkk):
            for j in range(kkk):
                if list[i][-1] == list[j][0]:
                    list[i].append(list[j][-1])
        return list

# Return the length of the longest chain in the list as the maximum depth
def max_deep(list):
    max = 2
    if len(list) == 0:
        return 0
    else:
        for i in range(len(list)):
            max = (len(list[i]) if len(list[i]) > max else max)
        return max - 1

# Keep only the repost relation columns (drop the timestamp)
def ti_qu(list):
    for i in range(len(list)):
        list[i] = list[i][:-1]
    return list

def cal_cal(all_in_one_rdd, j):
    id_rdd = all_in_one_rdd.keys()                        # RDD of weibo IDs
    line_time_rdd = all_in_one_rdd.values()               # RDD of (repost relation, repost time) lists
    line_time_rdd_j = cal_rdd_during(line_time_rdd, j)    # reposts that happened within the window
    line_rdd_j = line_time_rdd_j.map(lambda x: ti_qu(x))  # extract the repost relations
    line_rdd_j_extend = line_rdd_j.map(lambda x: add_deep(x))                 # extend the repost chains
    line_rdd_j_extend_maxdeep = line_rdd_j_extend.map(lambda x: max_deep(x))  # compute the maximum depth
    id_deep_rdd_j = id_rdd.zip(line_rdd_j_extend_maxdeep)  # pair each weibo ID with its depth
    return id_deep_rdd_j

def generate_deeps_file(rdd, k, path):
    import csv
    for j in range(k - 1, k + 1):
        b_path = str(path) + 'wid_deeps/wid_deeps_' + str(j) + '.csv'
        #print(path)
        out_file_train_deeps_j = open(b_path, 'w')
        writer = csv.writer(out_file_train_deeps_j)
        shendu = cal_cal(rdd, j + 1)
        for lists in shendu.collect():
            writer.writerow(lists)
        out_file_train_deeps_j.close()
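# A toy check (not part of the pipeline) of the chain-extension and depth logic; edges are [from, to]
# and the IDs below are made up. "A -> B" plus "B -> C" should give a maximum depth of 2.
toy_chains = [["A", "B"], ["B", "C"]]
toy_extended = add_deep(toy_chains)   # ["A", "B"] picks up "C" because "B" heads the second edge
print(toy_extended)                   # [['A', 'B', 'C'], ['B', 'C']]
print(max_deep(toy_extended))         # 2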
def get_wid_fnum_rdd(uid_fnum_rdd, wid_uid_rdd, path):
    #print("Sample of the (user id, fan count) mapping:")
    #print(uid_fnum_rdd.take(3))
    #print(uid_fnum_rdd.count())
    #print("\nSample of the reduced (weibo id, posting user id) RDD:")
    #print(wid_uid_rdd.take(3))
    #print(wid_uid_rdd.count())
    uid_wid_rdd = wid_uid_rdd.values().zip(wid_uid_rdd.keys())
    uid__wid_fnum = uid_wid_rdd.leftOuterJoin(uid_fnum_rdd)
    wid_fnum_rdd = uid__wid_fnum.values().map(lambda x: x[0]).zip(uid__wid_fnum.values().map(lambda x: x[1]))
    #print(wid_fnum_rdd.take(2))
    #print(wid_fnum_rdd.count())
    import csv
    c_path = str(path) + 'wid_fnum_file.csv'
    wid_fnum_file = open(c_path, "w")
    writer = csv.writer(wid_fnum_file)
    for lists in wid_fnum_rdd.collect():
        writer.writerow(lists)
    wid_fnum_file.close()
    return wid_fnum_rdd
# Flatten a list of lists (kept for reference; the regex-based flattening below is what gets used)
def add_flat(list):
    if list == None:
        return 0
    else:
        kkk = len(list)
        list0 = list[0]
        for i in range(kkk):
            if i == 0:
                pass
            else:
                list0.extend(list[i])
        return list0

# Compute the number of covered users by summing fan counts over a list of users
def clac_cover(list):
    total_cover = 0
    for i in range(len(list)):
        total_cover += cover_value(list[i])
    return total_cover

# Look up one user's fan count (expects a global list_uid_fnum; not used by the pipeline,
# which relies on the join-based approach in fans_cover_till_j instead)
def cover_value(user):
    '''
    try:
        return uid_fnum_dict[user]
    except:
        return 0
    '''
    for i in range(len(list_uid_fnum)):
        if user == list_uid_fnum[i][0]:
            return list_uid_fnum[i][1]
    return 0

def flatmapvalues(x):
    return x

# Sum the (possibly None) entries of a list, treating an empty or missing list as 0
def cal_sum(x):
    sum = 0
    if x == None or len(x) == 0:
        return sum
    else:
        for i in range(len(x)):
            if x[i] == None:
                pass
            else:
                sum += int(x[i])
        return sum

def fans_cover_till_j(all_in_one_rdd, j):
    id_rdd = all_in_one_rdd.keys()                        # RDD of weibo IDs
    line_time_rdd = all_in_one_rdd.values()               # RDD of (repost relation, repost time) lists
    line_time_rdd_j = cal_rdd_during(line_time_rdd, j)    # reposts that happened within the window
    line_rdd_j = line_time_rdd_j.map(lambda x: ti_qu(x))  # extract the repost relations
    #line_rdd_j_flat = line_rdd_j.map(lambda x: add_flat(x))  # list-based flattening (did not work)
    import re
    line_rdd_j_flat = line_rdd_j.map(lambda x: re.sub(r'\D', " ", str(x)).split())  # flatten the repost relations
    line_rdd_j_flat_disc = line_rdd_j_flat.map(lambda x: list(set(list(x))))  # deduplicate the user IDs
    fans_cover_rdd_j = id_rdd.zip(line_rdd_j_flat_disc)               # (weibo ID, distinct reposting users)
    fans_cover_rdd_j = fans_cover_rdd_j.flatMapValues(flatmapvalues)  # one (weibo ID, user) pair per user
    fans_cover_rdd_j = fans_cover_rdd_j.values().zip(fans_cover_rdd_j.keys())  # flip to (user, weibo ID)
    fans_cover_rdd_j = fans_cover_rdd_j.leftOuterJoin(uid_fnum_rdd).values()   # (weibo ID, fan count)
    fans_cover_rdd_j = fans_cover_rdd_j.map(lambda x: x[0]).zip(fans_cover_rdd_j.map(lambda x: x[1]))
    fans_cover_rdd_j = fans_cover_rdd_j.groupByKey().mapValues(list)  # group the fan counts per weibo ID
    fans_cover_rdd_j = fans_cover_rdd_j.keys().zip(fans_cover_rdd_j.values().map(lambda x: cal_sum(x)))  # sum them
    # Re-attach weibo IDs with no reposts in the window so every ID gets a (possibly zero) cover value
    temp_key_0 = all_in_one_rdd.keys().zip(all_in_one_rdd.values().map(lambda x: 0))
    fans_cover_rdd_j = temp_key_0.leftOuterJoin(fans_cover_rdd_j)
    fans_cover_rdd_j = fans_cover_rdd_j.keys().zip(fans_cover_rdd_j.values().map(lambda x: cal_sum(x)))
    return fans_cover_rdd_j

def generate_covers_file(rdd, k, path):
    import csv
    for j in range(k - 1, k + 1):
        c_path = str(path) + 'wid_covers/wid_covers_' + str(j) + '.csv'
        #print(c_path)
        out_file_train_covers_j = open(c_path, 'w')
        writer = csv.writer(out_file_train_covers_j)
        covers = fans_cover_till_j(rdd, j + 1)
        for lists in covers.collect():
            writer.writerow(lists)
        out_file_train_covers_j.close()
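# The net effect of fans_cover_till_j is: for each weibo, sum the fan counts of the distinct users that
# appear in its repost chains within the window. Below is a simplified sketch of the same aggregation
# with plain RDD operations (made-up IDs and fan counts; it assumes the SparkContext `sc` used here).
toy_uid_fans = sc.parallelize([("u1", 100), ("u2", 50), ("u3", 7)])
toy_wid_users = sc.parallelize([("w1", "u1"), ("w1", "u2"), ("w1", "u1"), ("w2", "u3")])
toy_cover = (toy_wid_users.distinct()                  # drop duplicate (weibo, user) pairs
             .map(lambda x: (x[1], x[0]))              # flip to (user, weibo) for the join
             .leftOuterJoin(toy_uid_fans)              # (user, (weibo, fan count or None))
             .map(lambda x: (x[1][0], x[1][1] or 0))   # back to (weibo, fan count)
             .reduceByKey(lambda a, b: a + b))         # sum fan counts per weibo
print(sorted(toy_cover.collect()))                     # [('w1', 150), ('w2', 7)]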
px, py, pz, p1, p2, p3, p4, p5 = get_basic_info()
uid_fnum_rdd = get_uid_fnum_rdd(sc, p5)
train_repost_id_line_time_reduce, train_wid_uid_rdd = get_prime_rdd('train', sc, p1, p2, p3, p4)
#wid_fnum_rdd = get_wid_fnum_rdd(uid_fnum_rdd, train_wid_uid_rdd, px)
#generate_times_file(train_repost_id_line_time_reduce, 292, px)
#generate_deeps_file(train_repost_id_line_time_reduce, 292, px)
#generate_covers_file(train_repost_id_line_time_reduce, 292, px)
test_repost_id_line_time_reduce, test_wid_uid_rdd = get_prime_rdd('test', sc, p1, p2, p3, p4)
#test_wid_fnum_rdd = get_wid_fnum_rdd(uid_fnum_rdd, test_wid_uid_rdd, py)
#generate_times_file(test_repost_id_line_time_reduce, 16, py)
#generate_deeps_file(test_repost_id_line_time_reduce, 16, py)
#generate_covers_file(test_repost_id_line_time_reduce, 16, py)
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import SparseVector, DenseVector

# Load the saved (weibo ID, fan count) mapping written by the feature-generation step above
# (note: this redefines the earlier get_wid_fnum_rdd with a different signature)
def get_wid_fnum_rdd(path):
    path = path + 'wid_fnum_file' + '.csv'
    wid_fnum_rdd = sc.textFile(path)
    wid_fnum_rdd = wid_fnum_rdd.map(lambda x: x.split(","))
    wid_fnum_rdd = wid_fnum_rdd.map(lambda x: x[0]).zip(wid_fnum_rdd.map(lambda x: x[1]))
    wid_fnum_rdd = wid_fnum_rdd.sortByKey()
    return wid_fnum_rdd

# Attach weibo IDs as keys to a prediction file that contains only values, by aligning row indices
def add_keys(rdd1):
    #path = '/home/jason/spark/weibo_predict/predicts/times_time_data_'+str(15)+'.txt'
    #rdd1 = sc.textFile(path)
    rdd2 = sc.textFile('/home/jason/spark/weibo_predict/test/wid_times/wid_times_0.csv')
    rdd2 = rdd2.map(lambda x: x.split(',')[0]).zip(rdd2.map(lambda x: x.split(',')[1]))
    rdd2 = rdd2.sortByKey()
    rdd1 = rdd1.zipWithIndex()
    rdd1 = rdd1.values().zip(rdd1.keys())
    rdd2 = rdd2.keys().zipWithIndex()
    rdd2 = rdd2.values().zip(rdd2.keys())
    rdd = rdd2.join(rdd1)
    rdd = rdd.values()
    rdd = rdd.map(lambda x: x[0]).zip(rdd.map(lambda x: x[1]))
    return rdd

# Load the remaining required features (times / deeps / covers) for one time step
def get_wid_x(j, path, times_or_deeps_or_covers):
    if times_or_deeps_or_covers == 'times':
        if path == px:
            path = str(path) + 'wid_times/wid_times_' + str(j) + '.csv'
        elif path == py:
            if j >= 0 and j < 15:
                path = str(path) + 'wid_times/wid_times_' + str(j) + '.csv'
            elif j >= 15 and j <= 291:
                path = '/home/jason/spark/weibo_predict/predicts/times_time_data_' + str(j) + '.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd
    elif times_or_deeps_or_covers == 'deeps':
        if path == px:
            path = str(path) + 'wid_deeps/wid_deeps_' + str(j) + '.csv'
        elif path == py:
            if j >= 0 and j < 15:
                path = str(path) + 'wid_deeps/wid_deeps_' + str(j) + '.csv'
            elif j >= 15 and j <= 291:
                path = '/home/jason/spark/weibo_predict/predicts/deeps_time_data_' + str(j) + '.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd
    elif times_or_deeps_or_covers == 'covers':
        if path == px:
            path = str(path) + 'wid_covers/wid_covers_' + str(j) + '.csv'
        elif path == py:
            if j >= 0 and j < 15:
                path = str(path) + 'wid_covers/wid_covers_' + str(j) + '.csv'
            elif j >= 15 and j <= 291:
                path = '/home/jason/spark/weibo_predict/predicts/covers_time_data_' + str(j) + '.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd
    else:
        print('wrong input about times_or_deeps_or_covers')
        return 0
    rdd = sc.textFile(path)
    rdd = rdd.map(lambda x: x.split(","))
    rdd = rdd.map(lambda x: x[0]).zip(rdd.map(lambda x: x[1]))
    rdd = rdd.sortByKey()
    return rdd

# Join two RDDs and flatten the joined value tuples into a flat list of fields
def my_join(rdd1, rdd2):
    import re
    joined = rdd1.join(rdd2)
    rdd = joined.keys().zip(joined.values().map(lambda x: re.sub(r'\D', " ", str(x)).split()))
    return rdd

# Turn one record into a libSVM-style line: "<label> 1:<f1> 2:<f2> ..."
def lib_svm(x):
    str1 = str(x[0] + ' ')
    for i in range(len(x)):
        if i == 0:
            pass
        else:
            str1 += str(str(i) + ":" + str(x[i]) + ' ')
    return str1
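# A toy illustration of the line produced by lib_svm (made-up values: the label comes first, then the features).
toy_record = ['3', '1200', '2', '4500']
print(lib_svm(toy_record))   # "3 1:1200 2:2 3:4500 "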
# Generate the training or test data for one time step
def generate_train_or_test_data(path, j, times_or_deeps):
    if times_or_deeps == 'times':
        if path == px:
            data_path = str(px) + 'train_data/times_train_data_' + str(j) + '.txt'
            wid_times_rdd = get_wid_x(j + 1, path, 'times')
        elif path == py:
            data_path = str(py) + 'test_data/times_test_data_' + str(j) + '.txt'
            wid_times_rdd = get_wid_x(j, path, 'times')
            #print(wid_times_rdd.count())
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        wid_deeps_rdd = get_wid_x(j, path, 'deeps')
        wid_covers_rdd = get_wid_x(j, path, 'covers')
        #wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x: float(x)/1000))
        records = my_join(wid_times_rdd, wid_fnum_rdd)
        records = my_join(records, wid_deeps_rdd)
        records = my_join(records, wid_covers_rdd)
        records = records.sortByKey()
        #print('check the order of keys() in the training set')
        #print(records.keys().take(10))
        records = records.values()
        data = records.map(lambda x: lib_svm(x))
        open_data_path = open(data_path, 'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('\n')
        open_data_path.close()
    elif times_or_deeps == 'deeps':
        if path == px:
            data_path = str(px) + 'train_data/deeps_train_data_' + str(j) + '.txt'
        elif path == py:
            data_path = str(py) + 'test_data/deeps_test_data_' + str(j) + '.txt'
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        if path == py:
            wid_deeps_rdd = get_wid_x(j, path, 'deeps')
        else:
            wid_deeps_rdd = get_wid_x(j + 1, path, 'deeps')
        wid_times_rdd = get_wid_x(j, path, 'times')
        wid_covers_rdd = get_wid_x(j, path, 'covers')
        #wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x: float(x)/1000))
        records = my_join(wid_deeps_rdd, wid_fnum_rdd)
        records = my_join(records, wid_times_rdd)
        records = my_join(records, wid_covers_rdd)
        records = records.values()
        data = records.map(lambda x: lib_svm(x))
        open_data_path = open(data_path, 'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('\n')
        open_data_path.close()
    elif times_or_deeps == 'covers':
        if path == px:
            data_path = str(px) + 'train_data/covers_train_data_' + str(j) + '.txt'
        elif path == py:
            data_path = str(py) + 'test_data/covers_test_data_' + str(j) + '.txt'
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        if path == py:
            wid_covers_rdd = get_wid_x(j, path, 'covers')
        else:
            wid_covers_rdd = get_wid_x(j + 1, path, 'covers')
        #wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x: float(x)/1000))
        wid_times_rdd = get_wid_x(j, path, 'times')
        wid_deeps_rdd = get_wid_x(j, path, 'deeps')
        records = my_join(wid_covers_rdd, wid_fnum_rdd)
        records = my_join(records, wid_times_rdd)
        records = my_join(records, wid_deeps_rdd)
        records = records.values()
        data = records.map(lambda x: lib_svm(x))
        open_data_path = open(data_path, 'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('\n')
        open_data_path.close()
    else:
        return 0

# Generate the predictions for the specified time step
def generate_test_predict(j, times_or_deeps):
    if times_or_deeps == 'times':
        from pyspark.mllib.tree import RandomForest, RandomForestModel
        from pyspark.mllib.util import MLUtils
        tr_path = '/home/jason/spark/weibo_predict/train/train_data/' + 'times_train_data_' + str(j) + '.txt'
        te_path = '/home/jason/spark/weibo_predict/test/test_data/' + 'times_test_data_' + str(j) + '.txt'
        train_data = MLUtils.loadLibSVMFile(sc, tr_path)
        test_data = MLUtils.loadLibSVMFile(sc, te_path)
        model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
                                            numTrees=3, featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=4, maxBins=32, seed=42)
        predictions = model.predict(test_data.map(lambda x: x.features))
        pre_path = '/home/jason/spark/weibo_predict/predicts/' + 'times_time_data_' + str(j + 1) + '.txt'
        times_predict = open(pre_path, 'w')
        for lines in predictions.collect():
            times_predict.write(str(int(lines)))
            times_predict.write('\n')
        times_predict.close()
    elif times_or_deeps == 'deeps':
        from pyspark.mllib.tree import RandomForest, RandomForestModel
        from pyspark.mllib.util import MLUtils
        tr_path = '/home/jason/spark/weibo_predict/train/train_data/' + 'deeps_train_data_' + str(j) + '.txt'
        te_path = '/home/jason/spark/weibo_predict/test/test_data/' + 'deeps_test_data_' + str(j) + '.txt'
        train_data = MLUtils.loadLibSVMFile(sc, tr_path)
        test_data = MLUtils.loadLibSVMFile(sc, te_path)
        model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
                                            numTrees=3, featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=4, maxBins=32, seed=42)
        predictions = model.predict(test_data.map(lambda x: x.features))
        pre_path = '/home/jason/spark/weibo_predict/predicts/' + 'deeps_time_data_' + str(j + 1) + '.txt'
        times_predict = open(pre_path, 'w')
        for lines in predictions.collect():
            times_predict.write(str(int(lines)))
            times_predict.write('\n')
        times_predict.close()
    elif times_or_deeps == 'covers':
        from pyspark.mllib.tree import RandomForest, RandomForestModel
        from pyspark.mllib.util import MLUtils
        tr_path = '/home/jason/spark/weibo_predict/train/train_data/' + 'covers_train_data_' + str(j) + '.txt'
        te_path = '/home/jason/spark/weibo_predict/test/test_data/' + 'covers_test_data_' + str(j) + '.txt'
        train_data = MLUtils.loadLibSVMFile(sc, tr_path)
        test_data = MLUtils.loadLibSVMFile(sc, te_path)
        model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
                                            numTrees=3, featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=4, maxBins=32, seed=42)
        predictions = model.predict(test_data.map(lambda x: x.features))
        pre_path = '/home/jason/spark/weibo_predict/predicts/' + 'covers_time_data_' + str(j + 1) + '.txt'
        times_predict = open(pre_path, 'w')
        for lines in predictions.collect():
            times_predict.write(str(int(lines)))
            times_predict.write('\n')
        times_predict.close()

def generate_test_data_beyond15(j):
    path = '/home/jason/spark/weibo_predict/predicts/' + 'time_data_' + str(j) + '.txt'
    rdd2 = sc.textFile(path)
    rdd1 = get_wid_fnum_rdd(py).keys()
    rdd = rdd1.zip(rdd2)
    return rdd
for i in range(15):
    generate_train_or_test_data(px, i, 'times')
    generate_train_or_test_data(py, i, 'times')
    generate_test_predict(i, 'times')
    generate_train_or_test_data(px, i, 'deeps')
    generate_train_or_test_data(py, i, 'deeps')
    generate_test_predict(i, 'deeps')
    generate_train_or_test_data(px, i, 'covers')
    generate_train_or_test_data(py, i, 'covers')
    generate_test_predict(i, 'covers')

for i in range(15, 292):
    print(i)
    generate_train_or_test_data(px, i, 'times')
    generate_train_or_test_data(py, i, 'times')
    generate_test_predict(i, 'times')
    generate_train_or_test_data(px, i, 'deeps')
    generate_train_or_test_data(py, i, 'deeps')
    generate_test_predict(i, 'deeps')
    generate_train_or_test_data(px, i, 'covers')
    generate_train_or_test_data(py, i, 'covers')
    generate_test_predict(i, 'covers')

generate_train_or_test_data(px, 291, 'times')
generate_train_or_test_data(py, 291, 'times')
generate_test_predict(291, 'times')
generate_train_or_test_data(px, 291, 'deeps')
generate_train_or_test_data(py, 291, 'deeps')
generate_test_predict(291, 'deeps')
generate_train_or_test_data(px, 291, 'covers')
generate_train_or_test_data(py, 291, 'covers')
generate_test_predict(291, 'covers')
# Put the pieces together to produce the final output file

rdd1 = sc.textFile('/home/jason/spark/weibo_predict/predicts/times_time_data_' + str(1) + '.txt')
rdd1 = add_keys(rdd1)
for j in range(4, 292):
    j = j + 1
    if j == 1:
        pass
    else:
        rdd2 = sc.textFile('/home/jason/spark/weibo_predict/predicts/times_time_data_' + str(j) + '.txt')
        rdd2 = add_keys(rdd2)
        rdd1 = my_join(rdd1, rdd2)

for j in range(4, 292):
    j = j + 1
    rdd3 = sc.textFile('/home/jason/spark/weibo_predict/predicts/deeps_time_data_' + str(j) + '.txt')
    rdd3 = add_keys(rdd3)
    rdd1 = my_join(rdd1, rdd3)

def add_head(x):
    str1 = 'testWeibo'
    str1 = str1 + str(x)
    return str1

import re
rdd1 = rdd1.map(lambda x: re.sub(r'\D', " ", str(x)).split())
rdd1 = rdd1.sortBy(lambda x: int(x[0]))
rdd1 = rdd1.map(lambda x: x[0]).zip(rdd1.map(lambda x: x[1:]))
rdd1_key = rdd1.keys().map(lambda x: add_head(x))
rdd1 = rdd1_key.zip(rdd1.values())
rdd1 = rdd1.map(lambda x: re.sub(r'\D', " ", str(x)).split())

import csv
path = '/home/jason/spark/weibo_predict/'
end_path = str(path) + 'end_of_end.csv'
end_f = open(end_path, 'w')
writer = csv.writer(end_f)
for lists in rdd1.collect():
    writer.writerow(lists)
end_f.close()

a = ','
s1 = ['scaleT' + str((i + 1) * 15) for i in range(4, 292)]
s1 = a.join(s1)
s2 = ['depthT' + str((i + 1) * 15) for i in range(4, 292)]
s2 = a.join(s2)
s3 = 'WeiboID (Time Unit: Minutes)' + a + s1 + a + s2
#print(s3)
end_path_2 = '/home/jason/spark/weibo_predict/end_of_end.csv'
end_path_1 = '/home/jason/spark/weibo_predict/end_of_end_.csv'
rdd = sc.textFile(end_path_2)
rdd = rdd.map(lambda x: add_head(x))
end_ff = open(end_path_1, 'w')
end_ff.write(s3)
end_ff.write('\n')
for lists in rdd.collect():
    end_ff.write(lists)
    end_ff.write('\n')
end_ff.close()
