Intern---Microsoft Academic China Team
项目二:
AEther:
项目一、项目需求:对搜索关键词进行类别统计分析,为后面的 entity-rank 做准备。
0,各种关键数据统计:
数据量:1 个月的数据,约 1000 T。
1,对IE的所有浏览搜索的提取代码:
Scope:
//Script GUID:ad2766d3-7aec-4ffa-9bbd-ec2740361999
//Used for tracking history

// Counts (query, normalized URL) pairs for one day (2016-07-26) of the
// competitive unified page view: page views that are not query pages
// themselves but were reached from a zh-cn "web" query page. The pairs are
// written to a .wsv file.
//
// NOTE(review): the published copy of this script was collapsed onto a single
// line, which put every statement behind the leading "//" comment marker, and
// the number after "HAVING Count >=" was lost. Line breaks are restored here.
// The threshold is written as 1, the neutral value (every group contains at
// least one row) -- TODO: restore the original cut-off.

REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

UnifiedViewRaw =
    VIEW "/shares/searchDM/distrib/released/CompetitiveUnifiedView/CompetitiveUnifiedPageView.view"
    PARAMS
    (
        Start = @"2016-07-26",
        End = @"2016-07-26",
        Source = @"All"
        // Source = @"DesktopIE"
    );

ClickData =
    SELECT Page_FromPage.Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Request_Url)) AS NormalizedUrl,
           COUNT() AS Count
    FROM UnifiedViewRaw
    WHERE Page_FromPage.IsQuery == true
          AND Page_FromPage.Vertical.ToLower() == "web"
          AND NOT Request_IsQuery
          AND Page_FromPage.Market.ToLower() == "zh-cn"
    HAVING Count >= 1 // TODO: original threshold lost in the published copy
           AND Query != ""
           AND NormalizedUrl != ""
           AND NormalizedUrl != null;

// Page_FromPage.IsQuery: True if the page is a query page
// Vertical: Search Vertical of this PageView
// Request_IsQuery bool: True if this page view is search engine result page

OUTPUT TO @"/my/entityranker/oneDay/IEqueryclickurlpairsAll.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

// NOTE(review): the published copy of this code-behind was collapsed onto one
// line and lost its numeric literals and a few keywords. The Substring
// arguments and the slash-position test below are reconstructed from context;
// the row limit of TopReducer could not be recovered (see TopCount).

/// <summary>
/// URL string helpers used to reduce a URL to a comparable form.
/// </summary>
public class CMyUtils
{
    /// <summary>
    /// Lower-cases the URL and strips a leading "http://" or "https://",
    /// a leading "www." and one trailing "/".
    /// </summary>
    /// <param name="url">URL text; may be null or empty.</param>
    /// <returns>The stripped URL, or an empty string for null/empty input.</returns>
    static public string NormalizeURL(string url)
    {
        // Guard: a null URL would otherwise throw on the first member call.
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // URLs are machine identifiers, so use culture-independent casing and
        // ordinal comparisons rather than the current culture's rules.
        url = url.ToLowerInvariant();

        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            // Drop exactly one trailing slash.
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">URL text; may be null or empty.</param>
    /// <returns>The normalized URL up to (not including) its first slash.</returns>
    static public string GetHost(string url)
    {
        url = NormalizeURL(url);

        int slashPosition = url.IndexOf('/');
        // NOTE(review): the original condition was lost; ">= 0" (slash found)
        // is the reconstruction -- confirm against the original source.
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}

/// <summary>
/// Reducer that passes through the first rows of each input group unchanged.
/// </summary>
public class TopReducer : Reducer
{
    // TODO: the original row limit was lost in the published copy; 1 is a
    // placeholder and must be confirmed before this reducer is used.
    private const int TopCount = 1;

    /// <summary>The output schema is a copy of the input schema.</summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return input.Clone();
    }

    /// <summary>
    /// Copies at most <c>TopCount</c> rows of the group to the output, in the
    /// order the input delivers them.
    /// </summary>
    public override IEnumerable<Row> Reduce(RowSet input, Row output, string[] args)
    {
        int emitted = 0;
        foreach (Row row in input.Rows)
        {
            // Keep enumerating after the limit is reached (no early break),
            // matching the shape of the original loop.
            if (emitted < TopCount)
            {
                emitted++;
                row.Copy(output);
                yield return output;
            }
        }
    }
}
2,对bing的所有浏览搜索的提取代码:
Scope:
//Script GUID:8e9ba2e3-8288-49e1-a5ff-7776d653ae16
//Used for tracking history

// Counts (word-broken query, normalized result URL) pairs for one day
// (2016-07-26) of the SLAPI page view, restricted to zh-cn "web" traffic that
// is neither bot nor marketing traffic. Web results are flattened by the
// user-defined processor FEXLogSimpleExtractor, and the pairs are written to a
// .wsv file.
//
// NOTE(review): the published copy of this script was collapsed onto a single
// line, which put every statement behind the leading "//" comment marker, and
// two numeric literals were lost. Line breaks are restored here.
//   * "WHERE Click >" is restored as "Click > 0" (keep clicked results only),
//     presumably the intent -- confirm.
//   * "HAVING PairCount >=" is restored with the neutral value 1 (every group
//     contains at least one row) -- TODO: restore the original cut-off.

REFERENCE "/local/IndexQualityCJK/wb/WordBreaker.dll";
RESOURCE "/local/IndexQualityCJK/wb/unzip.exe";
RESOURCE "/local/IndexQualityCJK/wb/wordbreak.zip";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/SearchLogApi.dll";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Serializer.exe";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Microsoft.Live.Json.dll";
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING MS.Internal.Bing.DataMining.SearchLogApi;
USING Microsoft.Live.Json;

SlapiPageView =
    VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogPageView.view"
    //VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogSessionView.view"
    //PARAMS (Start = @"2013-09-01", End = @"2013-09-01", UseSample=false, Dataset="Mobile");
    PARAMS (Start = "2016-07-26", End = "2016-07-26", UseSample=false, Dataset= "Mobile");

ZHCNTraffic =
    SELECT WordBreaker.BreakWords(Query_RawQuery, "zh-cn") AS Query,
           Request_RequestTime.ToString("yyyy-MM-dd") AS QDate,
           Page_Entities_WebResults
    FROM SlapiPageView
    WHERE Request_IsBotVNext == false
          AND Request_IsMarketingTraffic == false
          AND string.IsNullOrEmpty(Market) == false
          AND Market.ToLower() == "zh-cn"
          AND string.IsNullOrEmpty(Vertical) == false
          AND Vertical.ToLower() == "web";
          //AND string.IsNullOrEmpty(FormCode) == false AND (FormCode.ToUpper() == "QBLH" OR FormCode.ToUpper() == "QBRE" OR FormCode.ToUpper() == "MSNBHP" OR FormCode.ToUpper() == "MSNFLH" OR FormCode.ToUpper() == "BCNASI");

ProcessWebEntity =
    PROCESS ZHCNTraffic
    USING FEXLogSimpleExtractor;

ClickQueryUrlPairs =
    SELECT Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Url)) AS NormalizedUrl,
           COUNT() AS PairCount
    FROM ProcessWebEntity
    WHERE Click > 0 // reconstructed literal -- confirm
    HAVING PairCount >= 1 // TODO: original threshold lost in the published copy
           AND Query != ""
           AND NormalizedUrl != ""
           AND NormalizedUrl != null;

OUTPUT TO @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using MS.Internal.Bing.DataMining.SearchLogApi;

// NOTE(review): the published copy of this code-behind was collapsed onto one
// line and lost its numeric literals and a few keywords. The Substring
// arguments, the slash-position test, the "for" header and the emit condition
// below are reconstructed from context and should be confirmed against the
// original source.

/// <summary>
/// URL string helpers used to derive a host name from a result URL.
/// </summary>
public class URLUtility
{
    /// <summary>
    /// Lower-cases the URL and strips a leading "http://" or "https://",
    /// a leading "www." and one trailing "/".
    /// </summary>
    /// <param name="url">URL text; may be null or empty.</param>
    /// <returns>The stripped URL, or an empty string for null/empty input.</returns>
    static public string NormalizeURL(string url)
    {
        // Guard: a null URL would otherwise throw on the first member call.
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // URLs are machine identifiers, so use culture-independent casing and
        // ordinal comparisons rather than the current culture's rules.
        url = url.ToLowerInvariant();

        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            // Drop exactly one trailing slash.
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">URL text; may be null or empty.</param>
    /// <returns>The normalized URL up to (not including) its first slash.</returns>
    static public string GetHost(string url)
    {
        url = NormalizeURL(url);

        int slashPosition = url.IndexOf('/');
        // Reconstructed condition: cut only when a slash was found.
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}

/// <summary>
/// Flattens the web-result list of each input row into one output row per
/// result, carrying the query, its date, the result URL, the URL's host, the
/// result position and the click count.
/// </summary>
public class FEXLogSimpleExtractor : Processor
{
    /// <summary>Declares the fixed six-column output schema.</summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Query:string,QueryDate:string,Url:string,Host:string,POS:int,Click:int");
    }

    /// <summary>
    /// Reads "Query", "QDate" and "Page_Entities_WebResults" from each input
    /// row and yields one output row per web result that passes the click test.
    /// </summary>
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string Query = row["Query"].String;
            string QueryDate = row["QDate"].String;
            var WebEntities = row["Page_Entities_WebResults"].Value as MS.Internal.Bing.DataMining.SearchLogApi.WebResultList;

            // The "as" cast yields null when the column is empty or holds a
            // different type; skip the row instead of dereferencing null.
            if (WebEntities == null)
            {
                continue;
            }

            for (int i = 0; i < WebEntities.Count; i++)
            {
                string Url = WebEntities[i].TitleUrl;
                string Host = URLUtility.GetHost(Url);
                int Pos = WebEntities[i].PositionOfEntityInTopLevelRegion;
                int Click = WebEntities[i].Clicks.Count;

                // NOTE(review): the original condition was lost; "Click > 0"
                // (emit clicked results only) is presumed -- confirm.
                if (Click > 0)
                {
                    output["Query"].Set(Query);
                    output["QueryDate"].Set(QueryDate);
                    output["Url"].Set(Url);
                    output["Host"].Set(Host);
                    output["POS"].Set(Pos);
                    output["Click"].Set(Click);
                    yield return output;
                }
            }
        }
    }
}
3,搜索查询和分类的提取代码:
// Reads Url, Header, Body, HttpHeader and CodePage from the RetroIndex
// LatestSnapshot view, runs RetroIndexProcessor to produce Url, Country,
// Language and Category, keeps the rows whose lower-cased Language equals
// "zh_chs", and writes them to UrlCategory.wsv.
// NOTE(review): this script is printed collapsed onto one line, so as written
// everything sits behind the leading "//". The value after "TierFlag =" is
// also missing and cannot be recovered from this copy -- restore both from the
// original script before running.
//Script GUID:5b7abb8b-defd-4f2b-b703-9882ee6e960b //Used for tracking history REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/RetroIndexProcessor.dll"; REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll"; RESOURCE @"/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll"; USING RetroIndex; Snapshot = VIEW "/shares/searchWebLoad/RetroIndex/Views/LatestSnapshot.view" PARAMS ( Sample = false, TierFlag = ); SELECT Url, Header, Body, HttpHeader, CodePage FROM Snapshot; Uberchunk = PROCESS PRODUCE Url, Country, Language, Category USING RetroIndexProcessor HAVING string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs"; OUTPUT TO @"/local/IndexCJK/MobileTopSites/UrlCategory.wsv";
C#:
// Utility.CJKVersionMobileFriendly: splits the category string on "#TAB#" and
// returns true when any entry starts with "aa00" and ends with "Mobi",
// "CrossDevice", "MobileFriendly" or "MobileUnFriendly"; returns false for a
// null/empty category.
// NOTE(review): "MobileUnFriendly" also makes the "MobileFriendly" check
// return true -- confirm this is intended.
// Utility.CJKVersionMobileUnFriendly: same split, true only for entries that
// start with "aa00" and end with "MobileUnFriendly".
// CJKVersionMobileOkClassifierProcessor: declares the output schema
// "Url:string, MobileClassifier:int". For each input row it skips rows with an
// empty Url or a Language other than "zh_chs" (case-insensitive), reads the
// classifier feature columns, and emits Url plus MobileClassifier.
// NOTE(review): this code is printed collapsed onto two lines and has lost
// text. The declaration of MobileClassifier, the string conditions that end in
// '")' and every value assigned to MobileClassifier are missing, so the
// classification rules cannot be read from this copy and the code does not
// compile as shown. Restore them from the original source.
using System; using System.Collections.Generic; using System.IO; using System.Text; using ScopeRuntime; public class Utility { public static bool CJKVersionMobileFriendly(string category) { if (string.IsNullOrEmpty(category)) { return false; } string[] cates = category.Split(new string[] { "#TAB#" }, StringSplitOptions.RemoveEmptyEntries); foreach (string cate in cates) { if(cate.StartsWith("aa00") && (cate.EndsWith("Mobi") || cate.EndsWith("CrossDevice") || cate.EndsWith("MobileFriendly") || cate.EndsWith("MobileUnFriendly"))) { return true; } } return false; } public static bool CJKVersionMobileUnFriendly(string category) { if (string.IsNullOrEmpty(category)) { return false; } string[] cates = category.Split(new string[] { "#TAB#" }, StringSplitOptions.RemoveEmptyEntries); foreach (string cate in cates) { if (cate.StartsWith("aa00") && cate.EndsWith("MobileUnFriendly")) { return true; } } return false; } } public class CJKVersionMobileOkClassifierProcessor : Processor { public override Schema Produces(string[] columns, string[] args, Schema input) { return new Schema("Url:string, MobileClassifier:int"); } public override IEnumerable<Row> Process(RowSet input, Row output, string[] args) { foreach (Row row in input.Rows) { string Url = row["Url"].String; string Language = row["Language"].String; if (!(string.IsNullOrEmpty(Url) == false && string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs")) { continue; } //classifier features string Category = row["Category"].String; string DUPipeline_MobileUrls = row["DUPipeline_MobileUrls"].String; string DUPipeline_ResponsiveDesignSpans = row["DUPipeline_ResponsiveDesignSpans"].String; string DUV2_MobileUrl = row["DUV2_MobileUrl"].String; string InjHdr_MobileOkClassifier_v1 = row["InjHdr_MobileOkClassifier_v1"].String; string InjHdr_MobileOkX_v1 = row["InjHdr_MobileOkX_v1"].String; string InjHdr_MobileRedirect_V1 = row["InjHdr_MobileRedirect_V1"].String; string SpamJunkRuleID = 
row["SpamJunkRuleID"].String; string MobileOkClassifier = string.IsNullOrEmpty(InjHdr_MobileOkClassifier_v1) ? InjHdr_MobileOkX_v1 : InjHdr_MobileOkClassifier_v1; ; if (string.IsNullOrEmpty(DUPipeline_MobileUrls) == false || string.IsNullOrEmpty(DUPipeline_ResponsiveDesignSpans) == false || string.IsNullOrEmpty(DUV2_MobileUrl) == false || string.IsNullOrEmpty(InjHdr_MobileRedirect_V1) == false || Utility.CJKVersionMobileFriendly(Category) || (")) { MobileClassifier = ; } ")) { MobileClassifier = ; } ") { MobileClassifier = ; } output["Url"].Set(Url); output["MobileClassifier"].Set(MobileClassifier); yield return output; } } }
4,对IE和bing进行union,然后对相同的query进行合并。
Scope
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history

// Merges the IE and Bing (Query, Url, Count) pair files: the two inputs are
// concatenated with UNION ALL, the counts of identical (Query, Url) pairs are
// summed, and the result is written ordered by Query.
//
// NOTE(review): the published copy of this script was collapsed onto a single
// line, which put every statement behind the leading "//" comment marker;
// line breaks are restored here. The grouping columns are also spelled out
// with an explicit GROUP BY instead of being left implicit.
// NOTE(review): the IE input below is "IEqueryclickurlpairs.wsv", while the IE
// extraction script shown earlier writes "IEqueryclickurlpairsAll.wsv" --
// confirm which file is meant.

ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();

bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

union_all =
    SELECT * FROM ie
    UNION ALL
    SELECT * FROM bing;

result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all
    GROUP BY Query, Url
    ORDER BY Query;

OUTPUT result TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
5,得到Query, Category,ClickCount的对应。
// NOTE(review): this script is identical (same GUID, inputs and output path)
// to the union script shown under step 4. It sums Count per (Query, Url) and
// produces no Category column, so it does not appear to be the script that
// step 5 describes -- presumably a paste error; replace with the intended
// Query/Category/ClickCount script.
// It is also printed collapsed onto one line, so as written everything sits
// behind the leading "//".
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a //Used for tracking history ie = EXTRACT Query : string, Url : string, Count : int FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv" USING DefaultTextExtractor(); bing = EXTRACT Query : string, Url : string, Count : int FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv" USING DefaultTextExtractor(); union_all = SELECT * FROM ie UNION ALL SELECT * FROM bing; result = SELECT Query, Url, SUM(Count) AS NewCount FROM union_all ORDER BY Query; OUTPUT result TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
6,得到了 query->Category 之后,要算哪个 category 出现得最多。
每一条出现的地方*clickCount然后累加起来。
这里用到了reduce来做。
7,算分数。
Intern---Microsoft Academic China Team的更多相关文章
- Announcing Microsoft Research Open Data – Datasets by Microsoft Research now available in the cloud
The Microsoft Research Outreach team has worked extensively with the external research community to ...
- Team Foundation 中的错误和事件消息
Visual Studio Team System Team Foundation 中的错误和事件消息 Team Foundation 通过显示错误消息和事件消息来通知您操作成功以及操作失败.一部分错 ...
- Microsoft Dynamics CRM 分销行业解决方案
Microsoft Dynamics CRM 分销行业解决方案 方案亮点 360度动态渠道信息管理 充分的客户细分 全面的业务代表考核指标 业务代表管理和能力建设 业务代表过程管理 业务代表费用管理 ...
- Azure China (4) 管理Azure China Storage Account
<Windows Azure Platform 系列文章目录> Update 2015-05-10 强烈建议使用AzCopy工具,AzCopy命令行工具,是经过优化的.高性能Azure S ...
- Azure China (1) Azure公有云落地中国
<Windows Azure Platform 系列文章目录> 微软公有云Microsoft Azure已经落地中国,官方网址:http://www.windowsazure.cn/. 在 ...
- Microsoft server software support for Microsoft Azure virtual machines
http://support.microsoft.com/kb/2721672/en-us Article ID: 2721672 - Last Review: November 22, 2014 ...
- 如何访问Microsoft Azure Storage
首先先要创建存储账户 http://www.cnblogs.com/SignalTips/p/4119128.html 可以通过以下的几个方式访问 通过Visual Studio 2013 Commu ...
- Microsoft TFS 如何显示在Windows 的上下文菜单中
How to showing in Windows Explorer context for TFS I am not sure if this would help or you are willi ...
- Microsoft .NET Pet Shop 简介
最初研究 .NET Pet Shop 的目的是用 Microsoft .NET 实现 Sun 主要的 J2EE 蓝图应用程序 Sun Java Pet Store 同样的应用程序功能. 根据用 .NE ...
随机推荐
- 服务端跨域处理 Cors
1 添加 System.Web.Cors,System.Web.Http.Cors 2 global文件中 注册asp.net 管道事件 protected void Application_Beg ...
- Linux 安装MySQL
安装配置 [root@iZ28gvqe4biZ ~]# rpm -Uvh http://dev.mysql.com/get/mysql-community-release-el7-5.noarch.r ...
- linux-crontab定时任务
crontab命令常见于Unix和Linux的操作系统之中,用于设置周期性被执行的指令.该命令从标准输入设备读取指令,并将其存放于"crontab"文件中,以供之后读取和执行.通常 ...
- Scala中apply的用法
Scala中的 apply 方法有着不同的含义, 对于函数来说该方法意味着调用function本身, 以下说明摘自Programming in Scala, 3rd Edition Every fun ...
- angularjs $emit $on $broadcast 父子 兄弟之间传值
父子之间 <div ng-controller="ParentCtrl"> <div ng-controller="ChildCtrl"> ...
- JAVA与数据库MySQL相连接
JDBC(Java数据库连接体系结构): 是Java实现数据库访问的应用程序编程接口,主要功能是管理存放在数据库中的数据.通过接口对象,应用程序可以完成与数据库的连接,执行SQL语句,从数据库中获取结 ...
- 简单的maven配置
groupId是指com.xx 组织标识 artifactId才是项目名称 2)编译源代码 mvn compile 3)编译测试代码 mvn test-compile 4)清空 mvn clean 5 ...
- python学习笔记(二)
(一)模块打包 ---> 注:suba和subb文件夹下的__init__.py文件,即使为空,也必须存在 "setup.py" from distut ...
- 这两天遇到iphone使用app store下载免费软件,必须验证付款信息才能购物是怎么回事???
答案: 在你这台设备上再设置一下,具体方法是:1.点设置进入2.点iTunes Store 和App Store 3.点 Apple ID ,如果没设置,设置一下,如果有的,再点击 4.出现一上选择的 ...
- webapi8