This is the original code for predicting the depth and breadth of Weibo reposts. It took about half a month to write; the first version looked quite different, but this one contains all the required functionality.

It is also considerably more modular. Before job hunting I kept hoping to solve this problem cleanly in Python, but I eventually realized that both my method and my hardware had serious limitations.

This was my first serious attempt at processing massive data within a distributed-computing framework.

It made me aware of many problems and shaped a lot of the coding style I carried into interviews.

def get_basic_info():
    # Build every input/output path used by the pipeline and return them in a fixed order.
    win_path = "E:/spark/weibo_predict/"
    linux_path = "/home/jason/spark/weibo_predict/"
    path = linux_path
    train_path = path + 'train/'
    test_path = path + 'test/'
    code_path = path + 'source_code/'
    print('\nTraining output directory px: %s' % train_path)
    print('Test output directory py: %s' % test_path)
    print('Source-code directory pz: %s' % code_path)
    train_weibo_raw_path = path + "train_weibo_raw.txt"
    train_weibo_repost_path = path + "train_weibo_repost_back.txt"
    test_weibo_raw_path = path + "test_weibo_raw.txt"
    test_weibo_repost_path = path + "test_weibo_repost.txt"
    user_relations_path = path + "user_relations_back.txt"
    print("\nTraining original-weibo file p1: %s" % train_weibo_raw_path)
    print("Training repost file p2: %s" % train_weibo_repost_path)
    print("Test original-weibo file p3: %s" % test_weibo_raw_path)
    print("Test repost file p4: %s" % test_weibo_repost_path)
    print("User-relations file p5: %s" % user_relations_path)
    return (train_path, test_path, code_path,
            train_weibo_raw_path, train_weibo_repost_path,
            test_weibo_raw_path, test_weibo_repost_path, user_relations_path)
# Pass 'train' or 'test' to pick the (original weibo, repost) file pair.
# Returns the reduced repost mapping repost_id_line_time_reduce
# and the weibo-id -> poster-user-id mapping wid_uid_rdd.
from pyspark import SparkContext

def get_prime_rdd(train_or_test, sc, p1, p2, p3, p4):
    if train_or_test == 'train':
        inside_path_a = p1
        inside_path_b = p2
    elif train_or_test == 'test':
        inside_path_a = p3
        inside_path_b = p4
    else:
        print("only input train or test")
        return 0, 0
    # Original weibo file: fields separated by \001; field 0 is the weibo id, field 1 the poster's user id.
    train_weibo_raw_data = sc.textFile(inside_path_a)
    train_weibo_raw_data_rdd = train_weibo_raw_data.map(lambda x: x.split("\001"))
    w_id = train_weibo_raw_data_rdd.map(lambda x: x[0])
    u_id = train_weibo_raw_data_rdd.map(lambda x: x[1])
    wid_uid_rdd = w_id.zip(u_id)
    # Repost file: field 0 is the weibo id, the remaining fields describe one repost record (chain ... time).
    train_weibo_repost_data = sc.textFile(inside_path_b)
    train_weibo_repost_data_rdd = train_weibo_repost_data.map(lambda x: x.split("\001"))
    repost_id = train_weibo_repost_data_rdd.map(lambda x: x[0])
    repost_line_time = train_weibo_repost_data_rdd.map(lambda x: x[1:-1])
    repost_id_line_time = repost_id.zip(repost_line_time)
    repost_id_line_time_reduce = repost_id_line_time.groupByKey().mapValues(list)
    # The double subtractByKey keeps only weibo ids present in both RDDs (an intersection by key).
    repost_id_line_time_reduce = repost_id_line_time_reduce.subtractByKey(
        repost_id_line_time_reduce.subtractByKey(wid_uid_rdd))
    wid_uid_rdd = wid_uid_rdd.subtractByKey(
        wid_uid_rdd.subtractByKey(repost_id_line_time_reduce))
    return repost_id_line_time_reduce, wid_uid_rdd
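The double subtractByKey above is only a key intersection. A minimal sketch of the same idea on plain dicts, with made-up ids, just to make the intent explicit:

sample_raw = {'101': '9001', '102': '9002'}                                    # weibo_id -> poster user id
sample_reposts = {'102': [['9002', '9003', '300']], '103': [['9004', '9005', '60']]}
kept = {k: v for k, v in sample_raw.items() if k in sample_reposts}
print(kept)    # {'102': '9002'}  -- only ids present in both inputs survive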
def get_uid_fnum_rdd(sc, p5):
    # user_relations file: "user_id \t fan_id\x01fan_id\x01..." -- one line per user.
    user_relations_data = sc.textFile(p5)
    user_relations_data_rdd_1 = user_relations_data.map(lambda x: x.split("\t")[0])
    user_relations_data_rdd_2 = user_relations_data.map(lambda x: x.split("\t")[1])
    user_relations_data_rdd_user = user_relations_data_rdd_1
    user_relations_data_rdd_fans = user_relations_data_rdd_2.map(lambda x: x.split("\x01"))
    user_fans = user_relations_data_rdd_user.zip(user_relations_data_rdd_fans)
    fans_nums = user_relations_data_rdd_fans.map(lambda s: len(s))
    # (user_id, number_of_fans)
    uid_fnum_rdd = user_fans.keys().zip(fans_nums)
    return uid_fnum_rdd
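A minimal, Spark-free illustration of how one user_relations line is parsed. The exact line format is inferred from the splits above, and the ids here are made up:

sample_line = "9001\t9002\x019003\x019004"
uid = sample_line.split("\t")[0]
fans = sample_line.split("\t")[1].split("\x01")
print(uid, len(fans))    # 9001 3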
## Version 2: compute the repost count for a given time window (each window is 900 s = 15 min).
def cal_times_j(chains, j):
    ct = 0
    for i in range(len(chains)):
        # if int(chains[i][-1]) >= j*900 and int(chains[i][-1]) <= (j+1)*900:
        # Switch between per-interval and cumulative counts here; the cumulative form is used below.
        if int(chains[i][-1]) <= j * 900:
            ct += 1
    return ct

def cal_id_times_j(rdd, j):
    times = rdd.values().map(lambda x: cal_times_j(x, j))
    rdd = rdd.keys().zip(times)
    return rdd

def generate_times_file(rdd, k, path):
    import csv
    for j in range(k - 1, k + 1):
        a_path = str(path) + 'wid_times/wid_times_' + str(j) + '.csv'
        out_file_train_times_j = open(a_path, 'w')
        writer = csv.writer(out_file_train_times_j)
        zhuanfa = cal_id_times_j(rdd, j + 1)   # zhuanfa = repost counts for window j+1
        for lists in zhuanfa.collect():
            writer.writerow(lists)
        out_file_train_times_j.close()
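A tiny worked example of the cumulative count. The chains are made up; the last element of each chain is the repost time in seconds since the original post:

sample_chains = [['1001', '1002', '120'], ['1002', '1003', '950'], ['1003', '1004', '2000']]
print(cal_times_j(sample_chains, 1))   # reposts within 15 min (<=  900 s) -> 1
print(cal_times_j(sample_chains, 2))   # reposts within 30 min (<= 1800 s) -> 2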
# Depth computation.
# Keep only the repost records that happened within the first j windows.
def cal_during(chains, j):
    new_list = []
    for i in range(len(chains)):
        if int(chains[i][-1]) <= j * 900:
            new_list.append(chains[i])
    return new_list

# Apply cal_during to every weibo in an RDD.
def cal_rdd_during(rdd, j):
    return rdd.map(lambda x: cal_during(x, j))

# If the tail of one repost chain is the head of another, append that other chain's tail,
# so that chains get extended into longer paths.
def add_deep(chains):
    kkk = len(chains)
    if kkk <= 1:
        return chains
    else:
        for i in range(kkk):
            for j in range(kkk):
                if chains[i][-1] == chains[j][0]:
                    chains[i].append(chains[j][-1])
        return chains

# The length of the longest extended chain gives the maximum depth.
def max_deep(chains):
    max_len = 2
    if len(chains) == 0:
        return 0
    else:
        for i in range(len(chains)):
            max_len = (len(chains[i]) if len(chains[i]) > max_len else max_len)
        return max_len - 1

# Drop the timestamp column, keeping only the (from, to) part of each record.
def ti_qu(chains):
    for i in range(len(chains)):
        chains[i] = chains[i][:-1]
    return chains

def cal_cal(all_in_one_rdd, j):
    id_rdd = all_in_one_rdd.keys()                        # weibo ids
    line_time_rdd = all_in_one_rdd.values()               # repost chains with timestamps
    line_time_rdd_j = cal_rdd_during(line_time_rdd, j)    # records inside the first j windows
    line_rdd_j = line_time_rdd_j.map(lambda x: ti_qu(x))  # strip timestamps
    line_rdd_j_extend = line_rdd_j.map(lambda x: add_deep(x))                 # extend chains
    line_rdd_j_extend_maxdeep = line_rdd_j_extend.map(lambda x: max_deep(x))  # maximum depth
    id_deep_rdd_j = id_rdd.zip(line_rdd_j_extend_maxdeep)                     # (weibo_id, depth)
    return id_deep_rdd_j

def generate_deeps_file(rdd, k, path):
    import csv
    for j in range(k - 1, k + 1):
        b_path = str(path) + 'wid_deeps/wid_deeps_' + str(j) + '.csv'
        out_file_train_deeps_j = open(b_path, 'w')
        writer = csv.writer(out_file_train_deeps_j)
        shendu = cal_cal(rdd, j + 1)   # shendu = depths for window j+1
        for lists in shendu.collect():
            writer.writerow(lists)
        out_file_train_deeps_j.close()
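A small worked example of the chain extension and the depth measure, with made-up user ids, after the timestamps have already been stripped by ti_qu:

chains = [['1001', '1002'], ['1002', '1003']]   # 1001 -> 1002 and 1002 -> 1003
print(max_deep(add_deep(chains)))               # 2: the longest path 1001 -> 1002 -> 1003 is two hops deep
print(max_deep([['1001', '1002']]))             # 1: a single direct repost has depth 1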
def get_wid_fnum_rdd(uid_fnum_rdd, wid_uid_rdd, path):
    # Map each weibo id to the fan count of the user who posted it, and persist the mapping as a CSV.
    uid_wid_rdd = wid_uid_rdd.values().zip(wid_uid_rdd.keys())
    uid__wid_fnum = uid_wid_rdd.leftOuterJoin(uid_fnum_rdd)
    wid_fnum_rdd = uid__wid_fnum.values().map(lambda x: x[0]).zip(
        uid__wid_fnum.values().map(lambda x: x[1]))
    import csv
    c_path = str(path) + 'wid_fnum_file.csv'
    wid_fnum_file = open(c_path, "w")
    writer = csv.writer(wid_fnum_file)
    for lists in wid_fnum_rdd.collect():
        writer.writerow(lists)
    wid_fnum_file.close()
    return wid_fnum_rdd
# Flatten a list of lists into a single list (kept for reference; the regex-based
# flattening inside fans_cover_till_j is what is actually used).
def add_flat(lst):
    if lst is None:
        return 0
    else:
        flat = []
        for item in lst:
            flat.extend(item)
        return flat

# Total number of users covered: sum of the fan counts of every reposting user.
def clac_cover(users):
    total_cover = 0
    for i in range(len(users)):
        total_cover += cover_value(users[i])
    return total_cover

# Fan count of a single user; expects a global list_uid_fnum of (user_id, fan_count) pairs.
def cover_value(user):
    '''
    try:
        return uid_fnum_dict[user]
    except:
        return 0
    '''
    for i in range(len(list_uid_fnum)):
        if user == list_uid_fnum[i][0]:
            return list_uid_fnum[i][1]
    return 0

# Identity helper used with flatMapValues to turn one (id, [users]) pair into many (id, user) pairs.
def flatmapvalues(x):
    return x

# Sum a list of numeric strings, ignoring None entries.
def cal_sum(x):
    total = 0
    if x is None or len(x) == 0:
        return total
    for i in range(len(x)):
        if x[i] is None:
            pass
        else:
            total += int(x[i])
    return total

def fans_cover_till_j(all_in_one_rdd, j):
    id_rdd = all_in_one_rdd.keys()                        # weibo ids
    line_time_rdd = all_in_one_rdd.values()               # repost chains with timestamps
    line_time_rdd_j = cal_rdd_during(line_time_rdd, j)    # records inside the first j windows
    line_rdd_j = line_time_rdd_j.map(lambda x: ti_qu(x))  # strip timestamps
    import re
    # Flatten each weibo's chains into a plain list of user ids.
    line_rdd_j_flat = line_rdd_j.map(lambda x: re.sub(r'\D', " ", str(x)).split())
    # Deduplicate the reposting users.
    line_rdd_j_flat_disc = line_rdd_j_flat.map(lambda x: list(set(list(x))))
    fans_cover_rdd_j = id_rdd.zip(line_rdd_j_flat_disc)
    # One (weibo_id, user_id) pair per distinct reposting user.
    fans_cover_rdd_j = fans_cover_rdd_j.flatMapValues(flatmapvalues)
    # Swap to (user_id, weibo_id) so we can join against the global uid_fnum_rdd.
    fans_cover_rdd_j = fans_cover_rdd_j.values().zip(fans_cover_rdd_j.keys())
    fans_cover_rdd_j = fans_cover_rdd_j.leftOuterJoin(uid_fnum_rdd).values()
    # Back to (weibo_id, fan_count).
    fans_cover_rdd_j = fans_cover_rdd_j.map(lambda x: x[0]).zip(fans_cover_rdd_j.map(lambda x: x[1]))
    fans_cover_rdd_j = fans_cover_rdd_j.groupByKey().mapValues(list)
    # Sum the fan counts per weibo.
    fans_cover_rdd_j = fans_cover_rdd_j.keys().zip(fans_cover_rdd_j.values().map(lambda x: cal_sum(x)))
    # Make sure every weibo id gets a row, defaulting to 0 when nothing has been reposted yet.
    temp_key_0 = all_in_one_rdd.keys().zip(all_in_one_rdd.values().map(lambda x: 0))
    fans_cover_rdd_j = temp_key_0.leftOuterJoin(fans_cover_rdd_j)
    fans_cover_rdd_j = fans_cover_rdd_j.keys().zip(fans_cover_rdd_j.values().map(lambda x: cal_sum(x)))
    return fans_cover_rdd_j

def generate_covers_file(rdd, k, path):
    # In principle this should be fine.
    import csv
    for j in range(k - 1, k + 1):
        c_path = str(path) + 'wid_covers/wid_covers_' + str(j) + '.csv'
        out_file_train_covers_j = open(c_path, 'w')
        writer = csv.writer(out_file_train_covers_j)
        covers = fans_cover_till_j(rdd, j + 1)
        for lists in covers.collect():
            writer.writerow(lists)
        out_file_train_covers_j.close()
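A quick sanity check of the summing helper; the None entry mirrors what leftOuterJoin yields for ids that have no match in uid_fnum_rdd:

print(cal_sum(['120', None, '80']))   # 200
print(cal_sum(None))                  # 0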
# Driver section. `sc` is assumed to be an existing SparkContext (e.g. the one provided by the pyspark shell).
px, py, pz, p1, p2, p3, p4, p5 = get_basic_info()
uid_fnum_rdd = get_uid_fnum_rdd(sc, p5)
train_repost_id_line_time_reduce, train_wid_uid_rdd = get_prime_rdd('train', sc, p1, p2, p3, p4)
#wid_fnum_rdd = get_wid_fnum_rdd(uid_fnum_rdd,train_wid_uid_rdd,px)
#generate_times_file(train_repost_id_line_time_reduce,292,px)
#generate_deeps_file(train_repost_id_line_time_reduce,292,px)
#generate_covers_file(train_repost_id_line_time_reduce,292,px)
test_repost_id_line_time_reduce, test_wid_uid_rdd = get_prime_rdd('test', sc, p1, p2, p3, p4)
#test_wid_fnum_rdd = get_wid_fnum_rdd(uid_fnum_rdd,test_wid_uid_rdd,py)
#generate_times_file(test_repost_id_line_time_reduce,16,py)
#generate_deeps_file(test_repost_id_line_time_reduce,16,py)
#generate_covers_file(test_repost_id_line_time_reduce,16,py)
from pyspark.mllib.regression import LabeledPoint
import numpy as np
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.ml.linalg import Vectors
from pyspark.ml.linalg import SparseVector, DenseVector

# Load the weibo-id -> fan-count mapping back from the CSV written earlier
# (note: this redefines the earlier get_wid_fnum_rdd with a file-reading version).
def get_wid_fnum_rdd(path):
    path = path + 'wid_fnum_file' + '.csv'
    wid_fnum_rdd = sc.textFile(path)
    wid_fnum_rdd = wid_fnum_rdd.map(lambda x: x.split(","))
    wid_fnum_rdd = wid_fnum_rdd.map(lambda x: x[0]).zip(wid_fnum_rdd.map(lambda x: x[1]))
    wid_fnum_rdd = wid_fnum_rdd.sortByKey()
    return wid_fnum_rdd

# Attach weibo ids to a prediction file that contains only values,
# by aligning on row index with the ids in wid_times_0.csv.
def add_keys(rdd1):
    rdd2 = sc.textFile('/home/jason/spark/weibo_predict/test/wid_times/wid_times_0.csv')
    rdd2 = rdd2.map(lambda x: x.split(',')[0]).zip(rdd2.map(lambda x: x.split(',')[1]))
    rdd2 = rdd2.sortByKey()
    rdd1 = rdd1.zipWithIndex()
    rdd1 = rdd1.values().zip(rdd1.keys())   # (row_index, value)
    rdd2 = rdd2.keys().zipWithIndex()
    rdd2 = rdd2.values().zip(rdd2.keys())   # (row_index, weibo_id)
    rdd = rdd2.join(rdd1)                   # (row_index, (weibo_id, value))
    rdd = rdd.values()
    rdd = rdd.map(lambda x: x[0]).zip(rdd.map(lambda x: x[1]))
    return rdd

# Load one of the three feature files (times / deeps / covers) for window j.
def get_wid_x(j, path, times_or_deeps_or_covers):
    if times_or_deeps_or_covers == 'times':
        if path == px:
            path = str(path) + 'wid_times/wid_times_' + str(j) + '.csv'
        elif path == py:
            if j >= 0 and j < 15:
                path = str(path) + 'wid_times/wid_times_' + str(j) + '.csv'
            elif j >= 15 and j <= 291:
                # Beyond window 15 the test features come from the previous predictions.
                path = '/home/jason/spark/weibo_predict/predicts/times_time_data_' + str(j) + '.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd
    elif times_or_deeps_or_covers == 'deeps':
        if path == px:
            path = str(path) + 'wid_deeps/wid_deeps_' + str(j) + '.csv'
        elif path == py:
            if j >= 0 and j < 15:
                path = str(path) + 'wid_deeps/wid_deeps_' + str(j) + '.csv'
            elif j >= 15 and j <= 291:
                path = '/home/jason/spark/weibo_predict/predicts/deeps_time_data_' + str(j) + '.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd
    elif times_or_deeps_or_covers == 'covers':
        if path == px:
            path = str(path) + 'wid_covers/wid_covers_' + str(j) + '.csv'
        elif path == py:
            if j >= 0 and j < 15:
                path = str(path) + 'wid_covers/wid_covers_' + str(j) + '.csv'
            elif j >= 15 and j <= 291:
                path = '/home/jason/spark/weibo_predict/predicts/covers_time_data_' + str(j) + '.txt'
                rdd1 = sc.textFile(path)
                rdd = add_keys(rdd1)
                return rdd
    else:
        print('wrong input about times_or_deeps_or_covers')
        return 0
    rdd = sc.textFile(path)
    rdd = rdd.map(lambda x: x.split(","))
    rdd = rdd.map(lambda x: x[0]).zip(rdd.map(lambda x: x[1]))
    rdd = rdd.sortByKey()
    return rdd

# Join two RDDs on weibo id and flatten the joined values into a flat list of digit strings.
def my_join(rdd1, rdd2):
    import re
    rdd = rdd1.join(rdd2).keys().zip(
        rdd1.join(rdd2).values().map(lambda x: re.sub(r'\D', " ", str(x)).split()))
    return rdd

# Build one libsvm-style line from a joined record.
def lib_svm(x):
    # x[0] is the label; the remaining fields become "index:value" pairs.
    str1 = str(x[0] + ' ')
    for i in range(len(x)):
        if i == 0:
            pass
        else:
            str1 += str(str(i) + ":" + str(x[i]) + ' ')
    return str1

# Generate the training or test data for one time window.
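A quick illustration of the line format this produces, with a made-up label and feature values:

print(lib_svm(['12', '3500', '2', '48000']))
# '12 1:3500 2:2 3:48000 '  -- label first, then index:value feature pairs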
def generate_train_or_test_data(path, j, times_or_deeps):
    if times_or_deeps == 'times':
        if path == px:
            data_path = str(px) + 'train_data/times_train_data_' + str(j) + '.txt'
            # Training label: the repost count one window ahead.
            wid_times_rdd = get_wid_x(j + 1, path, 'times')
        elif path == py:
            data_path = str(py) + 'test_data/times_test_data_' + str(j) + '.txt'
            wid_times_rdd = get_wid_x(j, path, 'times')
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        wid_deeps_rdd = get_wid_x(j, path, 'deeps')
        wid_covers_rdd = get_wid_x(j, path, 'covers')
        #wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x:float(x)/1000))
        records = my_join(wid_times_rdd, wid_fnum_rdd)
        records = my_join(records, wid_deeps_rdd)
        records = my_join(records, wid_covers_rdd)
        records = records.sortByKey()
        records = records.values()
        data = records.map(lambda x: lib_svm(x))
        open_data_path = open(data_path, 'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('\n')
        open_data_path.close()
    elif times_or_deeps == 'deeps':
        if path == px:
            data_path = str(px) + 'train_data/deeps_train_data_' + str(j) + '.txt'
        elif path == py:
            data_path = str(py) + 'test_data/deeps_test_data_' + str(j) + '.txt'
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        if path == py:
            wid_deeps_rdd = get_wid_x(j, path, 'deeps')
        else:
            # Training label: the depth one window ahead.
            wid_deeps_rdd = get_wid_x(j + 1, path, 'deeps')
        wid_times_rdd = get_wid_x(j, path, 'times')
        wid_covers_rdd = get_wid_x(j, path, 'covers')
        #wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x:float(x)/1000))
        records = my_join(wid_deeps_rdd, wid_fnum_rdd)
        records = my_join(records, wid_times_rdd)
        records = my_join(records, wid_covers_rdd)
        records = records.values()
        data = records.map(lambda x: lib_svm(x))
        open_data_path = open(data_path, 'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('\n')
        open_data_path.close()
    elif times_or_deeps == 'covers':
        if path == px:
            data_path = str(px) + 'train_data/covers_train_data_' + str(j) + '.txt'
        elif path == py:
            data_path = str(py) + 'test_data/covers_test_data_' + str(j) + '.txt'
        else:
            return 0
        wid_fnum_rdd = get_wid_fnum_rdd(path)
        if path == py:
            wid_covers_rdd = get_wid_x(j, path, 'covers')
        else:
            # Training label: the coverage one window ahead.
            wid_covers_rdd = get_wid_x(j + 1, path, 'covers')
        #wid_covers_rdd = wid_covers_rdd.keys().zip(wid_covers_rdd.values().map(lambda x:float(x)/1000))
        wid_times_rdd = get_wid_x(j, path, 'times')
        wid_deeps_rdd = get_wid_x(j, path, 'deeps')
        records = my_join(wid_covers_rdd, wid_fnum_rdd)
        records = my_join(records, wid_times_rdd)
        records = my_join(records, wid_deeps_rdd)
        records = records.values()
        data = records.map(lambda x: lib_svm(x))
        open_data_path = open(data_path, 'w')
        for lines in data.collect():
            open_data_path.write(lines)
            open_data_path.write('\n')
        open_data_path.close()
    else:
        return 0

# Train on window j and write the prediction for window j+1 of the test set.
def generate_test_predict(j, times_or_deeps):
    if times_or_deeps == 'times':
        from pyspark.mllib.tree import RandomForest, RandomForestModel
        from pyspark.mllib.util import MLUtils
        tr_path = '/home/jason/spark/weibo_predict/train/train_data/' + 'times_train_data_' + str(j) + '.txt'
        te_path = '/home/jason/spark/weibo_predict/test/test_data/' + 'times_test_data_' + str(j) + '.txt'
        train_data = MLUtils.loadLibSVMFile(sc, tr_path)
        test_data = MLUtils.loadLibSVMFile(sc, te_path)
        model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
                                            numTrees=3, featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=4, maxBins=32, seed=42)
        predictions = model.predict(test_data.map(lambda x: x.features))
        pre_path = '/home/jason/spark/weibo_predict/predicts/' + 'times_time_data_' + str(j + 1) + '.txt'
        times_predict = open(pre_path, 'w')
        for lines in predictions.collect():
            times_predict.write(str(int(lines)))
            times_predict.write('\n')
        times_predict.close()
    elif times_or_deeps == 'deeps':
        from pyspark.mllib.tree import RandomForest, RandomForestModel
        from pyspark.mllib.util import MLUtils
        tr_path = '/home/jason/spark/weibo_predict/train/train_data/' + 'deeps_train_data_' + str(j) + '.txt'
        te_path = '/home/jason/spark/weibo_predict/test/test_data/' + 'deeps_test_data_' + str(j) + '.txt'
        train_data = MLUtils.loadLibSVMFile(sc, tr_path)
        test_data = MLUtils.loadLibSVMFile(sc, te_path)
        model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
                                            numTrees=3, featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=4, maxBins=32, seed=42)
        predictions = model.predict(test_data.map(lambda x: x.features))
        pre_path = '/home/jason/spark/weibo_predict/predicts/' + 'deeps_time_data_' + str(j + 1) + '.txt'
        times_predict = open(pre_path, 'w')
        for lines in predictions.collect():
            times_predict.write(str(int(lines)))
            times_predict.write('\n')
        times_predict.close()
    elif times_or_deeps == 'covers':
        from pyspark.mllib.tree import RandomForest, RandomForestModel
        from pyspark.mllib.util import MLUtils
        tr_path = '/home/jason/spark/weibo_predict/train/train_data/' + 'covers_train_data_' + str(j) + '.txt'
        te_path = '/home/jason/spark/weibo_predict/test/test_data/' + 'covers_test_data_' + str(j) + '.txt'
        train_data = MLUtils.loadLibSVMFile(sc, tr_path)
        test_data = MLUtils.loadLibSVMFile(sc, te_path)
        model = RandomForest.trainRegressor(train_data, categoricalFeaturesInfo={},
                                            numTrees=3, featureSubsetStrategy="auto",
                                            impurity='variance', maxDepth=4, maxBins=32, seed=42)
        predictions = model.predict(test_data.map(lambda x: x.features))
        pre_path = '/home/jason/spark/weibo_predict/predicts/' + 'covers_time_data_' + str(j + 1) + '.txt'
        times_predict = open(pre_path, 'w')
        for lines in predictions.collect():
            times_predict.write(str(int(lines)))
            times_predict.write('\n')
        times_predict.close()

# Re-attach test weibo ids to a bare prediction file for window j.
def generate_test_data_beyond15(j):
    path = '/home/jason/spark/weibo_predict/predicts/' + 'time_data_' + str(j) + '.txt'
    rdd2 = sc.textFile(path)
    rdd1 = get_wid_fnum_rdd(py).keys()
    rdd = rdd1.zip(rdd2)
    return rdd
# First 15 windows: test features come straight from the observed CSVs.
for i in range(15):
    generate_train_or_test_data(px, i, 'times')
    generate_train_or_test_data(py, i, 'times')
    generate_test_predict(i, 'times')
    generate_train_or_test_data(px, i, 'deeps')
    generate_train_or_test_data(py, i, 'deeps')
    generate_test_predict(i, 'deeps')
    generate_train_or_test_data(px, i, 'covers')
    generate_train_or_test_data(py, i, 'covers')
    generate_test_predict(i, 'covers')

# From window 15 on, the test features are the predictions of the previous window (rolling forecast).
for i in range(15, 292):
    print(i)
    generate_train_or_test_data(px, i, 'times')
    generate_train_or_test_data(py, i, 'times')
    generate_test_predict(i, 'times')
    generate_train_or_test_data(px, i, 'deeps')
    generate_train_or_test_data(py, i, 'deeps')
    generate_test_predict(i, 'deeps')
    generate_train_or_test_data(px, i, 'covers')
    generate_train_or_test_data(py, i, 'covers')
    generate_test_predict(i, 'covers')

# Re-run the final window explicitly.
generate_train_or_test_data(px, 291, 'times')
generate_train_or_test_data(py, 291, 'times')
generate_test_predict(291, 'times')
generate_train_or_test_data(px, 291, 'deeps')
generate_train_or_test_data(py, 291, 'deeps')
generate_test_predict(291, 'deeps')
generate_train_or_test_data(px, 291, 'covers')
generate_train_or_test_data(py, 291, 'covers')
generate_test_predict(291, 'covers')
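For orientation, window index j corresponds to j * 15 minutes after the original post (this follows from the 900-second window used in cal_times_j); a quick check:

for j in (1, 15, 292):
    print(j, j * 15, 'minutes')   # 15, 225 and 4380 minutes (about 73 hours)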
# Assemble the final submission file from the per-window prediction files.

rdd1 = sc.textFile('/home/jason/spark/weibo_predict/predicts/times_time_data_' + str(1) + '.txt')
rdd1 = add_keys(rdd1)
# Join the scale (repost-count) predictions for windows 5..292.
for j in range(4, 292):
    j = j + 1
    if j == 1:
        pass
    else:
        rdd2 = sc.textFile('/home/jason/spark/weibo_predict/predicts/times_time_data_' + str(j) + '.txt')
        rdd2 = add_keys(rdd2)
        rdd1 = my_join(rdd1, rdd2)
# Then append the depth predictions for the same windows.
for j in range(4, 292):
    j = j + 1
    rdd3 = sc.textFile('/home/jason/spark/weibo_predict/predicts/deeps_time_data_' + str(j) + '.txt')
    rdd3 = add_keys(rdd3)
    rdd1 = my_join(rdd1, rdd3)

def add_head(x):
    str1 = 'testWeibo'
    str1 = str1 + str(x)
    return str1

import re
# Flatten the nested join result into one flat list of numbers per weibo and sort by weibo id.
rdd1 = rdd1.map(lambda x: re.sub(r'\D', " ", str(x)).split())
rdd1 = rdd1.sortBy(lambda x: int(x[0]))
rdd1 = rdd1.map(lambda x: x[0]).zip(rdd1.map(lambda x: x[1:]))
rdd1_key = rdd1.keys().map(lambda x: add_head(x))
rdd1 = rdd1_key.zip(rdd1.values())
rdd1 = rdd1.map(lambda x: re.sub(r'\D', " ", str(x)).split())
import csv
path = '/home/jason/spark/weibo_predict/'
end_path = str(path) + 'end_of_end.csv'
end_f = open(end_path, 'w')
writer = csv.writer(end_f)
for lists in rdd1.collect():
    writer.writerow(lists)
end_f.close()

# Build the header row: one scaleT and one depthT column per 15-minute mark from 75 to 4380 minutes.
a = ','
s1 = ['scaleT' + str((i + 1) * 15) for i in range(4, 292)]
s1 = a.join(s1)
s2 = ['depthT' + str((i + 1) * 15) for i in range(4, 292)]
s2 = a.join(s2)
s3 = 'WeiboID (Time Unit: Minutes)' + a + s1 + a + s2
#print(s3)
# Prepend 'testWeibo' to every row and write the final file with the header.
end_path_2 = '/home/jason/spark/weibo_predict/end_of_end.csv'
end_path_1 = '/home/jason/spark/weibo_predict/end_of_end_.csv'
rdd = sc.textFile(end_path_2)
rdd = rdd.map(lambda x: add_head(x))
end_ff = open(end_path_1, 'w')
end_ff.write(s3)
end_ff.write('\n')
for lists in rdd.collect():
    end_ff.write(lists)
    end_ff.write('\n')
end_ff.close()
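To eyeball the header built above, printing the first few column names is enough:

print(s3.split(',')[:4])
# ['WeiboID (Time Unit: Minutes)', 'scaleT75', 'scaleT90', 'scaleT105']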
