This is the original code for predicting the depth and breadth of Weibo repost cascades. It took about half a month to write; the first version looked quite different, but this one contains all the required functionality and is considerably more modular.

Before job hunting I had hoped to solve this problem cleanly in Python, but eventually found that both my methods and my hardware had serious limitations.

This was my first serious attempt at computing over massive data within a distributed computing framework.

It made me aware of many problems, and it shaped much of the coding style I carried into interviews.

def get_basic_info():
    # Switch between the Windows and Linux data locations here.
    win_path = "E:/spark/weibo_predict/"
    linux_path = "/home/jason/spark/weibo_predict/"
    path = linux_path
    train_path = path + 'train/'
    test_path = path + 'test/'
    code_path = path + 'source_code/'
    print('\nTraining output directory px: %s' % train_path)
    print('\nTest output directory py: %s' % test_path)
    print('\nSource code directory pz: %s' % code_path)
    train_weibo_raw_path = path + "train_weibo_raw.txt"
    train_weibo_repost_path = path + "train_weibo_repost_back.txt"
    test_weibo_raw_path = path + "test_weibo_raw.txt"
    test_weibo_repost_path = path + "test_weibo_repost.txt"
    user_relations_path = path + "user_relations_back.txt"
    print("\nTraining raw weibo file p1: %s" % train_weibo_raw_path)
    print("Training repost file p2: %s" % train_weibo_repost_path)
    print("\nTest raw weibo file p3: %s" % test_weibo_raw_path)
    print("Test repost file p4: %s" % test_weibo_repost_path)
    print("\nUser relations file p5: %s" % user_relations_path)
    return train_path, test_path, code_path, train_weibo_raw_path, train_weibo_repost_path, \
           test_weibo_raw_path, test_weibo_repost_path, user_relations_path
# Pass 'train' (raw weibo, reposts) or 'test' (raw weibo, reposts).
# Returns the reduced repost mapping repost_id_line_time_reduce
# and the weibo-ID -> user-ID mapping wid_uid_rdd.
from pyspark import SparkContext
def get_prime_rdd(train_or_test, sc, p1, p2, p3, p4):
    if train_or_test == 'train':
        inside_path_a = p1
        inside_path_b = p2
    elif train_or_test == 'test':
        inside_path_a = p3
        inside_path_b = p4
    else:
        print("only input train or test")
        return 0, 0
    train_weibo_raw_data = sc.textFile(inside_path_a)
    train_weibo_raw_data_count = train_weibo_raw_data.count()
    train_weibo_raw_data_rdd = train_weibo_raw_data.map(lambda x: x.split("\001"))
    w_id = train_weibo_raw_data_rdd.map(lambda x: x[0])
    u_id = train_weibo_raw_data_rdd.map(lambda x: x[1])
    wid_uid_rdd = w_id.zip(u_id)
    train_weibo_repost_data = sc.textFile(inside_path_b)
    train_weibo_repost_data_count = train_weibo_repost_data.count()
    train_weibo_repost_data_rdd = train_weibo_repost_data.map(lambda x: x.split("\001"))
    repost_id = train_weibo_repost_data_rdd.map(lambda x: x[0])
    repost_line_time = train_weibo_repost_data_rdd.map(lambda x: x[1:-1])
    repost_id_line_time = repost_id.zip(repost_line_time)
    repost_id_line_time_reduce = repost_id_line_time.groupByKey().mapValues(list)
    # Keep only the keys present in both RDDs (intersection by key).
    repost_id_line_time_reduce = repost_id_line_time_reduce.subtractByKey(
        repost_id_line_time_reduce.subtractByKey(wid_uid_rdd))
    wid_uid_rdd = wid_uid_rdd.subtractByKey(
        wid_uid_rdd.subtractByKey(repost_id_line_time_reduce))
    return repost_id_line_time_reduce, wid_uid_rdd
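The double subtractByKey above deserves a note: the RDD API has no intersect-by-key, so A.subtractByKey(A.subtractByKey(B)) keeps exactly the keys of A that also appear in B. A standalone sketch (toy pairs of my own invention, run in a pyspark shell where sc exists):

a = sc.parallelize([("w1", 1), ("w2", 2), ("w3", 3)])
b = sc.parallelize([("w2", "u2"), ("w3", "u3"), ("w4", "u4")])
# a.subtractByKey(b) keeps keys only in a ("w1"); subtracting that
# from a again leaves the keys a shares with b.
common = a.subtractByKey(a.subtractByKey(b))
print(sorted(common.collect()))   # [('w2', 2), ('w3', 3)]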
def get_uid_fnum_rdd(sc, p5):
    user_relations_data = sc.textFile(p5)
    user_relations_data_count = user_relations_data.count()
    user_relations_data_rdd_1 = user_relations_data.map(lambda x: x.split("\t")[0])
    user_relations_data_rdd_2 = user_relations_data.map(lambda x: x.split("\t")[1])
    user_relations_data_rdd_user = user_relations_data_rdd_1
    user_relations_data_rdd_fans = user_relations_data_rdd_2.map(lambda x: x.split("\x01"))
    user_fans = user_relations_data_rdd_user.zip(user_relations_data_rdd_fans)
    fans_nums = user_relations_data_rdd_fans.map(lambda s: len(s))
    uid_fnum_rdd = user_fans.keys().zip(fans_nums)
    return uid_fnum_rdd
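Judging by the splits above, each line of user_relations_back.txt is expected to look like uid<TAB>fan1\x01fan2\x01...; the format is not documented anywhere, so this is inferred. A quick local check of the parsing with a made-up line:

line = "u1\tf1\x01f2"                      # hypothetical relations record
uid = line.split("\t")[0]                  # 'u1'
fans = line.split("\t")[1].split("\x01")   # ['f1', 'f2']
print(uid, len(fans))                      # u1 2 -> becomes the pair ('u1', 2)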
## Version 2: compute the repost count for a given time window.
def cal_times_j(lst, j):
    ct = 0
    for i in range(len(lst)):
        # if int(lst[i][-1]) >= j*900 and int(lst[i][-1]) <= (j+1)*900:
        # Toggle here between per-interval and cumulative repost counts.
        if int(lst[i][-1]) <= j * 900:
            ct += 1
    return ct
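The last field of each repost record is an offset in seconds from the original post, so j*900 marks the end of the j-th 15-minute window. A toy check with invented records:

# Three reposts at 100 s, 1000 s and 2000 s after the original post.
reposts = [['u1', 'u2', '100'], ['u2', 'u3', '1000'], ['u1', 'u4', '2000']]
print(cal_times_j(reposts, 1))  # 1 -> only the 100 s repost falls within 900 s
print(cal_times_j(reposts, 2))  # 2 -> cumulative count up to 1800 s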
def cal_id_times_j(rdd, j):
    times = rdd.values().map(lambda x: cal_times_j(x, j))
    rdd = rdd.keys().zip(times)
    return rdd

def generate_times_file(rdd, k, path):
    import csv
    for j in range(k - 1, k + 1):
        a_path = str(path) + 'wid_times/wid_times_' + str(j) + '.csv'
        out_file_train_times_j = open(a_path, 'w')
        writer = csv.writer(out_file_train_times_j)
        zhuanfa = cal_id_times_j(rdd, j + 1)
        for lists in zhuanfa.collect():
            writer.writerow(lists)
        out_file_train_times_j.close()
# Compute depth.
# Return the repost edges that occurred up to the given window.
def cal_during(lst, j):
    new_list = []
    for i in range(len(lst)):
        if int(lst[i][-1]) <= j * 900:
            new_list.append(lst[i])
    return new_list

# Apply the window filter to every element of an RDD.
def cal_rdd_during(rdd, j):
    return rdd.map(lambda x: cal_during(x, j))

# If the tail of one repost chain is the head of another edge, append that
# edge's tail to the chain, extending it.
def add_deep(lst):
    kkk = len(lst)
    if kkk <= 1:
        return lst
    else:
        for i in range(kkk):
            for j in range(kkk):
                if lst[i][-1] == lst[j][0]:
                    lst[i].append(lst[j][-1])
        return lst

# Return the length of the longest chain in the list as the maximum depth.
def max_deep(lst):
    max_len = 2
    if len(lst) == 0:
        return 0
    else:
        for i in range(len(lst)):
            max_len = (len(lst[i]) if len(lst[i]) > max_len else max_len)
        return max_len - 1
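To see how the depth functions interact, here is a toy cascade (edges invented for illustration). After ti_qu strips the timestamps, each inner list is a repost edge; add_deep splices edges whose tail matches another edge's head, and max_deep turns the longest chain into a depth. Note that the single pass appends only the matching edge's tail element, so some edge orderings can undercount:

edges = [['a', 'b'], ['b', 'c'], ['c', 'd']]
extended = add_deep(edges)
print(extended[0])         # ['a', 'b', 'c', 'd'] -- the fully spliced chain
print(max_deep(extended))  # 3 -> longest chain has length 4, minus 1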
# Keep only the first columns of each edge (drop the timestamp).
def ti_qu(lst):
    for i in range(len(lst)):
        lst[i] = lst[i][:-1]
    return lst

def cal_cal(all_in_one_rdd, j):
    id_rdd = all_in_one_rdd.keys()                        # weibo IDs
    line_time_rdd = all_in_one_rdd.values()               # repost edges with timestamps
    line_time_rdd_j = cal_rdd_during(line_time_rdd, j)    # edges that occurred up to window j
    line_rdd_j = line_time_rdd_j.map(lambda x: ti_qu(x))  # strip timestamps
    line_rdd_j_extend = line_rdd_j.map(lambda x: add_deep(x))                 # extend chains
    line_rdd_j_extend_maxdeep = line_rdd_j_extend.map(lambda x: max_deep(x))  # maximum depth
    id_deep_rdd_j = id_rdd.zip(line_rdd_j_extend_maxdeep)                     # pair weibo ID with depth
    return id_deep_rdd_j
def generate_deeps_file(rdd, k, path):
    import csv
    for j in range(k - 1, k + 1):
        b_path = str(path) + 'wid_deeps/wid_deeps_' + str(j) + '.csv'
        out_file_train_deeps_j = open(b_path, 'w')
        writer = csv.writer(out_file_train_deeps_j)
        shendu = cal_cal(rdd, j + 1)
        for lists in shendu.collect():
            writer.writerow(lists)
        out_file_train_deeps_j.close()
def get_wid_fnum_rdd(uid_fnum_rdd, wid_uid_rdd, path):
    # Map each weibo ID to its author's fan count, then persist to CSV.
    uid_wid_rdd = wid_uid_rdd.values().zip(wid_uid_rdd.keys())
    uid__wid_fnum = uid_wid_rdd.leftOuterJoin(uid_fnum_rdd)
    wid_fnum_rdd = uid__wid_fnum.values().map(lambda x: x[0]).zip(
        uid__wid_fnum.values().map(lambda x: x[1]))
    import csv
    c_path = str(path) + 'wid_fnum_file.csv'
    wid_fnum_file = open(c_path, "w")
    writer = csv.writer(wid_fnum_file)
    for lists in wid_fnum_rdd.collect():
        writer.writerow(lists)
    wid_fnum_file.close()
    return wid_fnum_rdd
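Note that leftOuterJoin keeps every weibo even when its author is missing from uid_fnum_rdd, filling the fan count with None; that is why cal_sum below has to guard against None. A toy illustration (IDs invented):

left = sc.parallelize([("u1", "w1"), ("u9", "w2")])   # uid -> wid
fans = sc.parallelize([("u1", 120)])                  # uid -> fan count
print(sorted(left.leftOuterJoin(fans).collect()))
# [('u1', ('w1', 120)), ('u9', ('w2', None))] -- unmatched uid yields None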
# Flatten a list of lists.
def add_flat(lst):
    if lst is None:
        return 0
    else:
        flat = []
        for i in range(len(lst)):
            flat.extend(lst[i])
        return flat

# Sum the coverage over a list of users.
def clac_cover(lst):
    total_cover = 0
    for i in range(len(lst)):
        total_cover += cover_value(lst[i])
    return total_cover

# Look up one user's fan count. list_uid_fnum is expected to be a global
# list of (uid, fnum) pairs; this linear scan was superseded by the RDD
# join inside fans_cover_till_j.
def cover_value(user):
    '''
    try:
        return uid_fnum_dict[user]
    except:
        return 0
    '''
    for i in range(len(list_uid_fnum)):
        if user == list_uid_fnum[i][0]:
            return list_uid_fnum[i][1]
    return 0
# Identity helper used with flatMapValues to explode a list of values into pairs.
def flatmapvalues(x):
    return x

def cal_sum(x):
    total = 0
    if x is None or len(x) == 0:
        return total
    for i in range(len(x)):
        if x[i] is None:
            pass
        else:
            total += int(x[i])
    return total
def fans_cover_till_j(all_in_one_rdd, j):
    # uid_fnum_rdd is the global (uid, fan count) RDD built in the driver below.
    id_rdd = all_in_one_rdd.keys()                        # weibo IDs
    line_time_rdd = all_in_one_rdd.values()               # repost edges with timestamps
    line_time_rdd_j = cal_rdd_during(line_time_rdd, j)    # edges up to window j
    line_rdd_j = line_time_rdd_j.map(lambda x: ti_qu(x))  # strip timestamps
    # Flatten the edge lists with a regex (add_flat did not work for this):
    # every non-digit run becomes a separator, leaving only the user IDs.
    import re
    line_rdd_j_flat = line_rdd_j.map(lambda x: re.sub(r'\D', " ", str(x)).split())
    # Deduplicate the reposting users of each weibo.
    line_rdd_j_flat_disc = line_rdd_j_flat.map(lambda x: list(set(list(x))))
    fans_cover_rdd_j = id_rdd.zip(line_rdd_j_flat_disc)
    # One (wid, uid) pair per reposting user.
    fans_cover_rdd_j = fans_cover_rdd_j.flatMapValues(flatmapvalues)
    # Swap to (uid, wid) and look up each user's fan count.
    fans_cover_rdd_j = fans_cover_rdd_j.values().zip(fans_cover_rdd_j.keys())
    fans_cover_rdd_j = fans_cover_rdd_j.leftOuterJoin(uid_fnum_rdd).values()
    # Back to (wid, fnum) pairs, then sum the fan counts per weibo.
    fans_cover_rdd_j = fans_cover_rdd_j.map(lambda x: x[0]).zip(
        fans_cover_rdd_j.map(lambda x: x[1]))
    fans_cover_rdd_j = fans_cover_rdd_j.groupByKey().mapValues(list)
    fans_cover_rdd_j = fans_cover_rdd_j.keys().zip(
        fans_cover_rdd_j.values().map(lambda x: cal_sum(x)))
    # Restore weibos with no reposts in this window: join against all IDs
    # (paired with 0) so they appear with coverage 0.
    temp_key_0 = all_in_one_rdd.keys().zip(all_in_one_rdd.values().map(lambda x: 0))
    fans_cover_rdd_j = temp_key_0.leftOuterJoin(fans_cover_rdd_j)
    fans_cover_rdd_j = fans_cover_rdd_j.keys().zip(
        fans_cover_rdd_j.values().map(lambda x: cal_sum(x)))
    return fans_cover_rdd_j
def generate_covers_file(rdd, k, path):
    # This should be correct in principle.
    import csv
    for j in range(k - 1, k + 1):
        c_path = str(path) + 'wid_covers/wid_covers_' + str(j) + '.csv'
        out_file_train_covers_j = open(c_path, 'w')
        writer = csv.writer(out_file_train_covers_j)
        covers = fans_cover_till_j(rdd, j + 1)
        for lists in covers.collect():
            writer.writerow(lists)
        out_file_train_covers_j.close()
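As an aside, the groupByKey-then-cal_sum step inside fans_cover_till_j can be expressed with reduceByKey, which avoids materializing the per-weibo lists. A sketch of the equivalent aggregation, assuming the same (wid, fnum-or-None) pair layout (my rewrite, not the original):

def fans_coverage(pairs):
    # pairs: RDD of (wid, fnum_or_None), as built just before groupByKey above.
    return pairs.mapValues(lambda v: 0 if v is None else int(v)) \
                .reduceByKey(lambda a, b: a + b)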
px, py, pz, p1, p2, p3, p4, p5 = get_basic_info()
uid_fnum_rdd = get_uid_fnum_rdd(sc, p5)
train_repost_id_line_time_reduce, train_wid_uid_rdd = get_prime_rdd('train', sc, p1, p2, p3, p4)
#wid_fnum_rdd = get_wid_fnum_rdd(uid_fnum_rdd, train_wid_uid_rdd, px)
#generate_times_file(train_repost_id_line_time_reduce, 292, px)
#generate_deeps_file(train_repost_id_line_time_reduce, 292, px)
#generate_covers_file(train_repost_id_line_time_reduce, 292, px)
test_repost_id_line_time_reduce, test_wid_uid_rdd = get_prime_rdd('test', sc, p1, p2, p3, p4)
#test_wid_fnum_rdd = get_wid_fnum_rdd(uid_fnum_rdd, test_wid_uid_rdd, py)
#generate_times_file(test_repost_id_line_time_reduce, 16, py)
#generate_deeps_file(test_repost_id_line_time_reduce, 16, py)
#generate_covers_file(test_repost_id_line_time_reduce, 16, py)
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import SparseVector, DenseVector

# Read back the weibo-ID -> fan-count mapping. Note: this redefines the
# earlier get_wid_fnum_rdd; from here on it loads the CSV written above.
def get_wid_fnum_rdd(path):
    path = path + 'wid_fnum_file' + '.csv'
    wid_fnum_rdd = sc.textFile(path)
    wid_fnum_rdd = wid_fnum_rdd.map(lambda x: x.split(","))
    wid_fnum_rdd = wid_fnum_rdd.map(lambda x: x[0]).zip(wid_fnum_rdd.map(lambda x: x[1]))
    wid_fnum_rdd = wid_fnum_rdd.sortByKey()
    return wid_fnum_rdd
# Re-attach weibo IDs to a value-only prediction file by aligning row order.
def add_keys(rdd1):
    rdd2 = sc.textFile('/home/jason/spark/weibo_predict/test/wid_times/wid_times_0.csv')
    rdd2 = rdd2.map(lambda x: x.split(',')[0]).zip(rdd2.map(lambda x: x.split(',')[1]))
    rdd2 = rdd2.sortByKey()
    rdd1 = rdd1.zipWithIndex()
    rdd1 = rdd1.values().zip(rdd1.keys())    # (row_index, predicted value)
    rdd2 = rdd2.keys().zipWithIndex()
    rdd2 = rdd2.values().zip(rdd2.keys())    # (row_index, weibo ID)
    rdd = rdd2.join(rdd1)
    rdd = rdd.values()
    rdd = rdd.map(lambda x: x[0]).zip(rdd.map(lambda x: x[1]))   # (weibo ID, value)
    return rdd
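add_keys relies on the prediction file preserving the row order of the sorted wid_times_0.csv; zipWithIndex turns that positional correspondence into an explicit join key. A minimal illustration with stand-in data:

ids = sc.parallelize(["w1", "w2", "w3"])     # stand-in for the sorted weibo IDs
vals = sc.parallelize(["10", "20", "30"])    # stand-in for one prediction per row
a = ids.zipWithIndex().map(lambda kv: (kv[1], kv[0]))    # (row_index, wid)
b = vals.zipWithIndex().map(lambda kv: (kv[1], kv[0]))   # (row_index, value)
print(sorted(a.join(b).values().collect()))  # [('w1', '10'), ('w2', '20'), ('w3', '30')]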
# Load one of the three feature RDDs (times, deeps, or covers) for window j.
def get_wid_x(j, path, times_or_deeps_or_covers):
    if times_or_deeps_or_covers == 'times':
        if path == px:
            path = str(path) + 'wid_times/wid_times_' + str(j) + '.csv'
        elif path == py:
            if j >= 0 and j < 15:
                path = str(path) + 'wid_times/wid_times_' + str(j) + '.csv'
            elif j >= 15 and j <= 291:
                # Past the 15 observed windows, test features come from the
                # model's own earlier predictions.
                path = '/home/jason/spark/weibo_predict/predicts/times_time_data_' + str(j) + '.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd
    elif times_or_deeps_or_covers == 'deeps':
        if path == px:
            path = str(path) + 'wid_deeps/wid_deeps_' + str(j) + '.csv'
        elif path == py:
            if j >= 0 and j < 15:
                path = str(path) + 'wid_deeps/wid_deeps_' + str(j) + '.csv'
            elif j >= 15 and j <= 291:
                path = '/home/jason/spark/weibo_predict/predicts/deeps_time_data_' + str(j) + '.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd
    elif times_or_deeps_or_covers == 'covers':
        if path == px:
            path = str(path) + 'wid_covers/wid_covers_' + str(j) + '.csv'
        elif path == py:
            if j >= 0 and j < 15:
                path = str(path) + 'wid_covers/wid_covers_' + str(j) + '.csv'
            elif j >= 15 and j <= 291:
                path = '/home/jason/spark/weibo_predict/predicts/covers_time_data_' + str(j) + '.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd
    else:
        print('wrong input about times_or_deeps_or_covers')
        return 0
    rdd = sc.textFile(path)
    rdd = rdd.map(lambda x: x.split(","))
    rdd = rdd.map(lambda x: x[0]).zip(rdd.map(lambda x: x[1]))
    rdd = rdd.sortByKey()
    return rdd

# Join two RDDs and return one RDD of (key, flattened numeric fields).
def my_join(rdd1, rdd2):
    import re
    joined = rdd1.join(rdd2)
    rdd = joined.keys().zip(joined.values().map(lambda x: re.sub(r'\D', " ", str(x)).split()))
    return rdd

# Build a LibSVM-format line from an RDD element.
def lib_svm(x):
    str1 = str(x[0] + ' ')
    for i in range(len(x)):
        if i == 0:
            pass
        else:
            str1 += str(str(i) + ":" + str(x[i]) + ' ')
    return str1
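The first element serves as the label and the remaining elements become 1-indexed features, which is the usual LibSVM layout. For example, with made-up values:

print(lib_svm(['3', '120', '2', '4500']))
# '3 1:120 2:2 3:4500 ' -- label first, then index:value pairs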
# Build the training or test data files for one window.
def generate_train_or_test_data(path, j, times_or_deeps):
    if times_or_deeps == 'times':
        if path == px:
            data_path = str(px) + 'train_data/times_train_data_' + str(j) + '.txt'
            wid_times_rdd = get_wid_x(j + 1, path, 'times')   # label: next window
        elif path == py:
            data_path = str(py) + 'test_data/times_test_data_' + str(j) + '.txt'
            wid_times_rdd = get_wid_x(j, path, 'times')
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        wid_deeps_rdd = get_wid_x(j, path, 'deeps')
        wid_covers_rdd = get_wid_x(j, path, 'covers')
        records = my_join(wid_times_rdd, wid_fnum_rdd)
        records = my_join(records, wid_deeps_rdd)
        records = my_join(records, wid_covers_rdd)
        records = records.sortByKey()
        records = records.values()
        data = records.map(lambda x: lib_svm(x))
        open_data_path = open(data_path, 'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('\n')
        open_data_path.close()
    elif times_or_deeps == 'deeps':
        if path == px:
            data_path = str(px) + 'train_data/deeps_train_data_' + str(j) + '.txt'
        elif path == py:
            data_path = str(py) + 'test_data/deeps_test_data_' + str(j) + '.txt'
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        if path == py:
            wid_deeps_rdd = get_wid_x(j, path, 'deeps')
        else:
            wid_deeps_rdd = get_wid_x(j + 1, path, 'deeps')   # label: next window
        wid_times_rdd = get_wid_x(j, path, 'times')
        wid_covers_rdd = get_wid_x(j, path, 'covers')
        records = my_join(wid_deeps_rdd, wid_fnum_rdd)
        records = my_join(records, wid_times_rdd)
        records = my_join(records, wid_covers_rdd)
        records = records.values()
        data = records.map(lambda x: lib_svm(x))
        open_data_path = open(data_path, 'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('\n')
        open_data_path.close()
    elif times_or_deeps == 'covers':
        if path == px:
            data_path = str(px) + 'train_data/covers_train_data_' + str(j) + '.txt'
        elif path == py:
            data_path = str(py) + 'test_data/covers_test_data_' + str(j) + '.txt'
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        if path == py:
            wid_covers_rdd = get_wid_x(j, path, 'covers')
        else:
            wid_covers_rdd = get_wid_x(j + 1, path, 'covers')  # label: next window
        wid_times_rdd = get_wid_x(j, path, 'times')
        wid_deeps_rdd = get_wid_x(j, path, 'deeps')
        records = my_join(wid_covers_rdd, wid_fnum_rdd)
        records = my_join(records, wid_times_rdd)
        records = my_join(records, wid_deeps_rdd)
        records = records.values()
        data = records.map(lambda x: lib_svm(x))
        open_data_path = open(data_path, 'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('\n')
        open_data_path.close()
    else:
        return 0
# Generate predictions for the specified window. The original had three
# near-identical branches that differed only in the file prefix, so they
# share one body here.
def generate_test_predict(j, times_or_deeps):
    if times_or_deeps not in ('times', 'deeps', 'covers'):
        return 0
    from pyspark.mllib.tree import RandomForest, RandomForestModel
    from pyspark.mllib.util import MLUtils
    prefix = times_or_deeps
    tr_path = '/home/jason/spark/weibo_predict/train/train_data/' + prefix + '_train_data_' + str(j) + '.txt'
    te_path = '/home/jason/spark/weibo_predict/test/test_data/' + prefix + '_test_data_' + str(j) + '.txt'
    train_data = MLUtils.loadLibSVMFile(sc, tr_path)
    test_data = MLUtils.loadLibSVMFile(sc, te_path)
    model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
                                        numTrees=3, featureSubsetStrategy="auto",
                                        impurity='variance', maxDepth=4, maxBins=32, seed=42)
    predictions = model.predict(test_data.map(lambda x: x.features))
    pre_path = '/home/jason/spark/weibo_predict/predicts/' + prefix + '_time_data_' + str(j + 1) + '.txt'
    times_predict = open(pre_path, 'w')
    for lines in predictions.collect():
        times_predict.write(str(int(lines)))
        times_predict.write('\n')
    times_predict.close()
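The regressor is retrained for every window and its fit is never measured. A quick sanity check one could place right after model.predict(...) inside generate_test_predict, using the standard mllib label/prediction zip (my addition, not part of the original pipeline):

# Hypothetical evaluation snippet; assumes the test files carry true labels.
labels = test_data.map(lambda lp: lp.label)
mse = labels.zip(predictions).map(lambda lp: (lp[0] - lp[1]) ** 2).mean()
print('window %d MSE: %f' % (j, mse))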
def generate_test_data_beyond15(j):
    path = '/home/jason/spark/weibo_predict/predicts/' + 'time_data_' + str(j) + '.txt'
    rdd2 = sc.textFile(path)
    rdd1 = get_wid_fnum_rdd(py).keys()
    rdd = rdd1.zip(rdd2)
    return rdd
# Rolling prediction: each window's predictions become the next window's
# test features (see get_wid_x for j >= 15).
for i in range(15):
    generate_train_or_test_data(px, i, 'times')
    generate_train_or_test_data(py, i, 'times')
    generate_test_predict(i, 'times')
    generate_train_or_test_data(px, i, 'deeps')
    generate_train_or_test_data(py, i, 'deeps')
    generate_test_predict(i, 'deeps')
    generate_train_or_test_data(px, i, 'covers')
    generate_train_or_test_data(py, i, 'covers')
    generate_test_predict(i, 'covers')

for i in range(15, 292):
    print(i)
    generate_train_or_test_data(px, i, 'times')
    generate_train_or_test_data(py, i, 'times')
    generate_test_predict(i, 'times')
    generate_train_or_test_data(px, i, 'deeps')
    generate_train_or_test_data(py, i, 'deeps')
    generate_test_predict(i, 'deeps')
    generate_train_or_test_data(px, i, 'covers')
    generate_train_or_test_data(py, i, 'covers')
    generate_test_predict(i, 'covers')

generate_train_or_test_data(px, 291, 'times')
generate_train_or_test_data(py, 291, 'times')
generate_test_predict(291, 'times')
generate_train_or_test_data(px, 291, 'deeps')
generate_train_or_test_data(py, 291, 'deeps')
generate_test_predict(291, 'deeps')
generate_train_or_test_data(px, 291, 'covers')
generate_train_or_test_data(py, 291, 'covers')
generate_test_predict(291, 'covers')
# Assemble the final submission file.

rdd1 = sc.textFile('/home/jason/spark/weibo_predict/predicts/times_time_data_' + str(1) + '.txt')
rdd1 = add_keys(rdd1)
for j in range(4, 292):
    j = j + 1
    if j == 1:
        pass
    else:
        rdd2 = sc.textFile('/home/jason/spark/weibo_predict/predicts/times_time_data_' + str(j) + '.txt')
        rdd2 = add_keys(rdd2)
        rdd1 = my_join(rdd1, rdd2)

for j in range(4, 292):
    j = j + 1
    rdd3 = sc.textFile('/home/jason/spark/weibo_predict/predicts/deeps_time_data_' + str(j) + '.txt')
    rdd3 = add_keys(rdd3)
    rdd1 = my_join(rdd1, rdd3)

def add_head(x):
    str1 = 'testWeibo'
    str1 = str1 + str(x)
    return str1

import re
rdd1 = rdd1.map(lambda x: re.sub(r'\D', " ", str(x)).split())
rdd1 = rdd1.sortBy(lambda x: int(x[0]))
rdd1 = rdd1.map(lambda x: x[0]).zip(rdd1.map(lambda x: x[1:]))
rdd1_key = rdd1.keys().map(lambda x: add_head(x))
rdd1 = rdd1_key.zip(rdd1.values())
# Note: the regex below strips the 'testWeibo' prefix again from the tuple
# string; it is re-applied per line when the final file is written.
rdd1 = rdd1.map(lambda x: re.sub(r'\D', " ", str(x)).split())

import csv
path = '/home/jason/spark/weibo_predict/'
end_path = str(path) + 'end_of_end.csv'
end_f = open(end_path, 'w')
writer = csv.writer(end_f)
for lists in rdd1.collect():
    writer.writerow(lists)
end_f.close()

# Build the header row: scale (repost count) and depth columns, in minutes.
a = ','
s1 = ['scaleT' + str((i + 1) * 15) for i in range(4, 292)]
s1 = a.join(s1)
s2 = ['depthT' + str((i + 1) * 15) for i in range(4, 292)]
s2 = a.join(s2)
s3 = 'WeiboID (Time Unit: Minutes)' + a + s1 + a + s2
end_path_2 = '/home/jason/spark/weibo_predict/end_of_end.csv'
end_path_1 = '/home/jason/spark/weibo_predict/end_of_end_.csv'
rdd = sc.textFile(end_path_2)
rdd = rdd.map(lambda x: add_head(x))
end_ff = open(end_path_1, 'w')
end_ff.write(s3)
end_ff.write('\n')
for lists in rdd.collect():
    end_ff.write(lists)
    end_ff.write('\n')
end_ff.close()
