注意:在 pig 中用 run 或者 exec 运行脚本时,除了 cd 和 ls 之外,其他 shell 命令尽量不要使用。本代码以 rm 和 mv 命令为例,这类命令很容易出错。

另外,pig 只有在执行 store 或 dump 时才会真正加载数据;否则只是解析代码,并不实际操作数据。所以在执行 rm 操作时,必须注意要删除的文件是否已经生成。如果要 rm 的文件尚未生成,可以先把结果写到第三个(临时)文件,再用 mv 进行改名操作。

-- Job name. Per the original note, the job status (success or failure) can be
-- checked under this name at http://172.XX.XX.XX:50030/jobtracker.jsp
SET job.name 'test_age_reporth_istorical';
-- Job priority.
SET job.priority HIGH;

-- Register the piggybank jar. The original note says it is used for reading
-- sequence files and for writing the analysis result files.
REGISTER piggybank.jar;
-- Short alias for the piggybank loader of binary sequence files.
-- NOTE(review): no LOAD in the visible part of this script uses this alias.
DEFINE SequenceFileLoader org.apache.pig.piggybank.storage.SequenceFileLoader();





-- Input locations.
-- Cleaned_Log depends on the date parameter, which is passed in from outside.
%default Cleaned_Log /user/C/data/XXX/cleaned/$date/*/part*
%default AD_Data /user/XXX/data/xxx/metadata/ad/part*
%default Campaign_Data /user/xxx/data/xxx/metadata/campaign/part*
%default Social_Data /user/xxx/data/report/socialdata/part*

-- All output locations.
-- The file_path and year parameters have no default in the visible part of this
-- script, so they are presumably supplied when the script is launched.
-- Each report has three paths: accumulated rows, aggregated counts (_sum) and a
-- scratch directory (_tmp) that is renamed over the accumulated rows.

-- Per industry.
%default Industry_Path $file_path/report/historical/age/$year/industry
%default Industry_SUM $file_path/report/historical/age/$year/industry_sum
%default Industry_TMP $file_path/report/historical/age/$year/industry_tmp

-- Per industry and brand.
%default Industry_Brand_Path $file_path/report/historical/age/$year/industry_brand
%default Industry_Brand_SUM $file_path/report/historical/age/$year/industry_brand_sum
%default Industry_Brand_TMP $file_path/report/historical/age/$year/industry_brand_tmp

-- Overall (age and log type only).
%default ALL_Path $file_path/report/historical/age/$year/all
%default ALL_SUM $file_path/report/historical/age/$year/all_sum
%default ALL_TMP $file_path/report/historical/age/$year/all_tmp

-- NOTE(review): output_path is not referenced in the visible part of this script.
%default output_path /user/xxx/tmp/result









-- Read the cleaned log files as comma-separated text.
-- All 40 columns are declared as chararray.
origin_cleaned_data = LOAD '$Cleaned_Log' USING PigStorage(',') AS (
    ad_network_id:chararray, xxx_ad_id:chararray, guid:chararray, id:chararray,
    create_time:chararray, action_time:chararray, log_type:chararray, ad_id:chararray,
    positioning_method:chararray, location_accuracy:chararray, lat:chararray, lon:chararray,
    cell_id:chararray, lac:chararray, mcc:chararray, mnc:chararray,
    ip:chararray, connection_type:chararray, android_id:chararray, android_advertising_id:chararray,
    openudid:chararray, mac_address:chararray, uid:chararray, density:chararray,
    screen_height:chararray, screen_width:chararray, user_agent:chararray, app_id:chararray,
    app_category_id:chararray, device_model_id:chararray, carrier_id:chararray, os_id:chararray,
    device_type:chararray, os_version:chararray, country_region_id:chararray, province_region_id:chararray,
    city_region_id:chararray, ip_lat:chararray, ip_lon:chararray, quadkey:chararray
);





-- Ad metadata, comma-separated: (adId, campaignId).
metadata_ad = LOAD '$AD_Data' USING PigStorage(',')
    AS (adId:chararray, campaignId:chararray);

-- Campaign metadata, comma-separated: (campaignId, industryId, brandId).
metadata_campaign = LOAD '$Campaign_Data' USING PigStorage(',')
    AS (campaignId:chararray, industryId:chararray, brandId:chararray);

-- Inner join of ads and campaigns on campaignId.
-- Joined columns: (adId, campaignId, campaignId, industryId, brandId).
ad_with_campaign = JOIN metadata_ad BY campaignId, metadata_campaign BY campaignId;

-- Drop the redundant campaignId columns, leaving (adId, industryId, brandId).
joined_ad_campaign_data = FOREACH ad_with_campaign GENERATE
    metadata_ad::adId AS adId,
    metadata_campaign::industryId AS industryId,
    metadata_campaign::brandId AS brandId;





-- Keep only the log columns used by the analysis: (xxx_ad_id, guid, log_type).
log_ad_guid_type = FOREACH origin_cleaned_data GENERATE xxx_ad_id, guid, log_type;

-- Remove duplicate (xxx_ad_id, guid, log_type) rows.
distinct_origin_historical_age = DISTINCT log_ad_guid_type;





-- Social metadata, comma-separated:
-- (guid_social, sex, age, income, edu, hobby).
social_profile = LOAD '$Social_Data' USING PigStorage(',')
    AS (guid_social:chararray, sex:chararray, age:chararray,
        income:chararray, edu:chararray, hobby:chararray);

-- Only the user id and the age are needed downstream.
social_age = FOREACH social_profile GENERATE guid_social, age;





-- Inner join of the social data with the de-duplicated log rows on the user id.
-- Joined columns: (guid_social, age, xxx_ad_id, guid, log_type).
log_with_age = JOIN social_age BY guid_social, distinct_origin_historical_age BY guid;

-- Log rows enriched with the age: (xxx_ad_id, guid, log_type, age).
log_age_columns = FOREACH log_with_age GENERATE xxx_ad_id, guid, log_type, age;

-- Attach industry and brand by matching the ad id of the log row.
-- Joined columns: (adId, industryId, brandId, xxx_ad_id, guid, log_type, age).
log_with_campaign = JOIN joined_ad_campaign_data BY adId, log_age_columns BY xxx_ad_id;

-- Final working set for all reports: (guid, log_type, industryId, brandId, age).
all_current_data = FOREACH log_with_campaign GENERATE guid, log_type, industryId, brandId, age;





-- Industry report: merges the current rows with the accumulated history and
-- counts rows per (age, industryId, log_type).

-- Current rows in the stored column order: (industryId, guid, age, log_type).
industry_current_data = FOREACH all_current_data GENERATE industryId,guid,age,log_type;

-- Accumulated rows written by earlier runs, same column order.
-- NOTE(review): this LOAD presumably requires the path to exist already, so the
-- very first run may need the directory to be seeded. Confirm.
industry_existed_Data = LOAD '$Industry_Path' USING PigStorage(',') AS (industryId:chararray,guid:chararray,age:chararray,log_type:chararray);

-- Merge history with the current rows and drop duplicates.
union_Industry = UNION industry_existed_Data, industry_current_data;
distict_union_industry = DISTINCT union_Industry;

-- Positional group key: fields 2, 0 and 3, that is (age, industryId, log_type).
group_industry = GROUP distict_union_industry BY ($2,$0,$3);
-- Flatten the key and count the guid values (field 1 of each grouped row).
count_guid_for_industry = FOREACH group_industry GENERATE FLATTEN(group),COUNT($1.$1);

-- Clear the output locations before storing.
-- rmf is used instead of rm because rm fails when the path does not exist
-- (for example on the first run), while rmf ignores a missing path.
-- The scratch directory is cleared as well so that a leftover from a failed
-- earlier run cannot make the STORE fail.
rmf $Industry_SUM;
rmf $Industry_TMP;

-- Aggregated counts.
STORE count_guid_for_industry INTO '$Industry_SUM' USING PigStorage(',');

-- Merged rows (current and history) go to the scratch directory first because
-- the history path is still an input of this same job.
STORE distict_union_industry INTO '$Industry_TMP' USING PigStorage(',');

-- Replace the history with the freshly written merged rows.
-- NOTE(review): this relies on Pig running the pending STORE statements before
-- it executes a file command. Confirm against the multi-query documentation.
rmf $Industry_Path;
mv $Industry_TMP $Industry_Path;





-- Industry and brand report: merges the current rows with the accumulated
-- history and counts rows per (age, industryId, brandId, log_type).

-- Current rows in the stored column order:
-- (age, industryId, brandId, log_type, guid).
industry_brand_current = FOREACH all_current_data GENERATE age,industryId,brandId,log_type,guid;

-- Accumulated rows written by earlier runs, same column order.
-- NOTE(review): this LOAD presumably requires the path to exist already, so the
-- very first run may need the directory to be seeded. Confirm.
industry_brand_history = LOAD '$Industry_Brand_Path' USING PigStorage(',') AS(age:chararray, industryId:chararray, brandId:chararray, log_type:chararray, guid:chararray);

-- Merge current rows with history and drop duplicates.
union_industry_brand = UNION industry_brand_current,industry_brand_history;
unique_industry_brand = DISTINCT union_industry_brand;

-- Positional group key: fields 0 to 3, that is (age, industryId, brandId, log_type).
group_industry_brand = GROUP unique_industry_brand BY ($0,$1,$2,$3);
-- Flatten the key and count the guid values (field 4 of each grouped row).
count_guid_for_industry_brand = FOREACH group_industry_brand GENERATE FLATTEN(group),COUNT($1.$4);

-- Clear the output locations before storing.
-- rmf is used instead of rm because rm fails when the path does not exist
-- (for example on the first run), while rmf ignores a missing path.
-- The scratch directory is cleared as well so that a leftover from a failed
-- earlier run cannot make the STORE fail.
rmf $Industry_Brand_SUM;
rmf $Industry_Brand_TMP;

-- Aggregated counts.
STORE count_guid_for_industry_brand INTO '$Industry_Brand_SUM' USING PigStorage(',');

-- Merged rows go to the scratch directory first because the history path is
-- still an input of this same job.
STORE unique_industry_brand INTO '$Industry_Brand_TMP' USING PigStorage(',');

-- Replace the history with the freshly written merged rows.
-- NOTE(review): this relies on Pig running the pending STORE statements before
-- it executes a file command. Confirm against the multi-query documentation.
rmf $Industry_Brand_Path;
mv $Industry_Brand_TMP $Industry_Brand_Path;





-- Overall report: merges the current rows with the accumulated history and
-- counts rows per (age, log_type).

-- Current rows in the stored column order: (age, log_type, guid).
current_data = FOREACH all_current_data GENERATE age,log_type,guid;

-- Accumulated rows written by earlier runs, same column order.
-- NOTE(review): this LOAD presumably requires the path to exist already, so the
-- very first run may need the directory to be seeded. Confirm.
history_data = LOAD '$ALL_Path' USING PigStorage(',') AS(age:chararray,log_type:chararray,guid:chararray);

-- Merge history with the current rows and drop duplicates.
union_all_data = UNION history_data, current_data;
unique_all_data = DISTINCT union_all_data;

-- Positional group key: fields 0 and 1, that is (age, log_type).
group_all_data = GROUP unique_all_data BY ($0,$1);
-- Flatten the key and count the guid values (field 2 of each grouped row).
count_guid_for_age_logtype = FOREACH group_all_data GENERATE FLATTEN(group),COUNT($1.$2);

-- Clear the output locations before storing.
-- rmf is used instead of rm because rm fails when the path does not exist
-- (for example on the first run), while rmf ignores a missing path.
-- The scratch directory is cleared as well so that a leftover from a failed
-- earlier run cannot make the STORE fail.
rmf $ALL_SUM;
rmf $ALL_TMP;

-- Aggregated counts.
STORE count_guid_for_age_logtype INTO '$ALL_SUM' USING PigStorage(',');

-- Merged rows go to the scratch directory first because the history path is
-- still an input of this same job.
STORE unique_all_data INTO '$ALL_TMP' USING PigStorage(',');

-- Replace the history with the freshly written merged rows.
-- NOTE(review): this relies on Pig running the pending STORE statements before
-- it executes a file command. Confirm against the multi-query documentation.
rmf $ALL_Path;
mv $ALL_TMP $ALL_Path;

pig简单的代码实例:报表统计行业中的点击和曝光量的更多相关文章

  1. JS代码实用代码实例(输入框监听,点击显示点击其他地方消失,文件本地预览上传)

    前段时间写前端,遇到一些模块非常有用,总结以备后用 一.input框字数监听 <!DOCTYPE html> <html lang="en"> <he ...

  2. Redis:安装、配置、操作和简单代码实例(C语言Client端)

    Redis:安装.配置.操作和简单代码实例(C语言Client端) - hj19870806的专栏 - 博客频道 - CSDN.NET Redis:安装.配置.操作和简单代码实例(C语言Client端 ...

  3. Python模拟登陆淘宝并统计淘宝消费情况的代码实例分享

    Python模拟登陆淘宝并统计淘宝消费情况的代码实例分享 支付宝十年账单上的数字有点吓人,但它统计的项目太多,只是想看看到底单纯在淘宝上支出了多少,于是写了段脚本,统计任意时间段淘宝订单的消费情况,看 ...

  4. 使用ssm(spring+springMVC+mybatis)创建一个简单的查询实例(二)(代码篇)

    这篇是上一篇的延续: 用ssm(spring+springMVC+mybatis)创建一个简单的查询实例(一) 源代码在github上可以下载,地址:https://github.com/guoxia ...

  5. c#之GDI简单实现代码及其实例

    作业:文档形式 3到5页理解 1.理解 2.源代码解释(1到2页) 3.实现效果 项目地址: https://github.com/zhiyishou/polyer Demo:https://zhiy ...

  6. 审核流(3)低调奢华,简单不凡,实例演示-SNF.WorkFlow--SNF快速开发平台3.1

    下面我们就从什么都没有,结合审核流进行演示实例.从无到有如何快速完美的实现,然而如此简单.低调而奢华,简单而不凡. 从只有数据表通过SNF.CodeGenerator代码生成器快速生成单据并与审核流进 ...

  7. bzoj P1058 [ZJOI2007]报表统计——solution

    1058: [ZJOI2007]报表统计 Time Limit: 15 Sec  Memory Limit: 162 MB Submit: 4099  Solved: 1390 [Submit][St ...

  8. input文本框实现宽度自适应代码实例

    代码实例如下: <!DOCTYPE html> <html><head><meta charset="utf-8"><meta ...

  9. jQuery实现的鼠标滑过切换图片代码实例

    jQuery实现的鼠标滑过切换图片代码实例:有时候网页需要这样的简单效果,那就是当鼠标滑过默认图片的时候,能够实现图片的切换,可能在实际应用中,往往没有这么简单,不过大家可以自行扩展一下,下面简单介绍 ...

随机推荐

  1. Kafka,Mq,Redis作为消息队列使用时的差异?

    redis 消息推送(基于分布式 pub/sub)多用于实时性较高的消息推送,并不保证可靠.其他的mq和kafka保证可靠但有一些延迟(非实时系统没有保证延迟).redis-pub/sub断电就清空, ...

  2. Java常量初始化后不会再去重新获取

    Java虚拟机编译机制:更改常量部分 最近一个Java项目中需要修改一个静态常量的值,本地修改编译以后调试正常,然后把对应的entity类的class文件上传到服务器对应的目录以后系统依旧我行我素,各 ...

  3. Asp.Net Core 2.0 项目实战(7)MD5加密、AES&DES对称加解密

    本文目录 1. 摘要 2. MD5加密封装 3. AES的加密.解密 4. DES加密/解密 5. 总结 1.  摘要 C#中常用的一些加密和解密方案,如:md5加密.RSA加密与解密和DES加密等, ...

  4. Python的一个解释凯撒密码的程序

    #!/usr/bin/env python # -*- coding: utf-8 -*- ''' { Title:CaserCode Author:naiquan Type:crypto Detai ...

  5. VSCode 插件推荐

    vscode-icons  用于项目中文件类型显示对应的图标,提高文件定位的效率. vscode-tslint  用于 TS 的规范检测 Path Intellisense  用于提示导入文件时候的路 ...

  6. 为什么《Dive into Python》不值得推荐

    2010 年 5 月 5 日更新:我翻译了一篇<<Dive Into Python>非死不可>作为对本文观点的进一步支持和对评论的回复,请见:http://blog.csdn. ...

  7. 自定义下拉刷新上拉加载View

    MainActivity.java package com.heima52.pullrefresh; import java.util.ArrayList; import com.heima52.pu ...

  8. 计算机网络之IP地址

    IP地址的分类 整个的因特网就是一个单一的.抽象的网络.IP地址就是给因特网上的每一个主机(或路由器)的每一个接口分配一个在全世界范围内唯一的32位的标识符. 所谓分类的IP地址,就是将IP地址划分为 ...

  9. Android开发艺术探索——第二章:IPC机制(中)

    Android开发艺术探索--第二章:IPC机制(中) 好的,我们继续来了解IPC机制,在上篇我们可能就是把理论的知识写完了,然后现在基本上是可以实战了. 一.Android中的IPC方式 本节我们开 ...

  10. 给pdf文件添加防伪水印logo(附工程源码下载)

    pdf添加水印logo这种需求场景确实很少,有些时候一些销售单据生成pdf添加一个水印logo,做一个简单的防伪效果,虽然实际上并没有太大作用,但是产品经理说要,巴拉巴拉--省略一万字. 下面将源码分 ...