Intern---Microsoft Academic China Team
项目二:
AEther:
项目 一、项目需求:对搜索关键词进行类别的统计分析,为了后面的entity-rank做准备。
0,各种关键数据统计:
数据量:1个月数据:about 1000T。
1,对IE的所有浏览搜索的提取代码:
Scope:
//Script GUID:ad2766d3-7aec-4ffa-9bbd-ec2740361999
//Used for tracking history
// Purpose: counts (query, normalized URL) pairs taken from the competitive
// unified page view for 2016-07-26 and writes the result to a .wsv file.
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";
// Input rowset: one day of page views.
// NOTE(review): the surrounding section heading says this script extracts IE
// traffic, but Source is "All" and the "DesktopIE" value is commented out -
// confirm which source was actually intended.
UnifiedViewRaw =
VIEW "/shares/searchDM/distrib/released/CompetitiveUnifiedView/CompetitiveUnifiedPageView.view"
PARAMS
(
Start = @"2016-07-26",
End = @"2016-07-26",
Source = @"All"
// Source = @"DesktopIE"
);
// Keeps page views that are not themselves query pages (NOT Request_IsQuery)
// but whose originating page is a "web" vertical query page in the zh-cn
// market, then counts rows per (Query, NormalizedUrl).
// There is no explicit GROUP BY; presumably the non-aggregated columns act as
// the grouping key - confirm against the SCOPE language reference.
ClickData =
SELECT Page_FromPage.Query,
RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Request_Url)) AS NormalizedUrl,
COUNT() AS Count
FROM UnifiedViewRaw
WHERE Page_FromPage.IsQuery == true AND Page_FromPage.Vertical.ToLower() == "web" AND NOT Request_IsQuery AND Page_FromPage.Market.ToLower() == "zh-cn"
// NOTE(review): the numeric threshold after "Count >=" is missing from this
// copy of the script, so the HAVING clause does not parse as written. Restore
// the intended minimum count before running.
HAVING Count >= AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;
// Page_FromPage.IsQuery: True if the page is a query page
// Vertical: Search Vertical of this PageView
// Request_IsQuery bool: True if this page view is search engine result page
// No rowset is named after OUTPUT; presumably the most recently defined rowset
// (ClickData) is written - confirm.
OUTPUT
TO @"/my/entityranker/oneDay/IEqueryclickurlpairsAll.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
// URL helpers used to reduce URLs to a comparable form.
public class CMyUtils
{
    // Returns a simplified form of the given URL:
    //   - lower-cased (culture-invariant, so the result does not depend on the
    //     machine locale, e.g. the Turkish dotless-i rule),
    //   - with a leading "http://" or "https://" removed,
    //   - with a leading "www." removed,
    //   - with a single trailing "/" removed.
    // A null or empty input yields an empty string instead of throwing.
    // Note that the whole URL is lower-cased, including path and query.
    public static string NormalizeURL(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        url = url.ToLowerInvariant();

        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            // Drop exactly one trailing slash.
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    // Returns the part of the normalized URL that precedes the first "/",
    // i.e. the host (plus port, if any). When the normalized URL contains no
    // "/", the whole normalized URL is returned.
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}
// Reducer that forwards some of the rows of each input group unchanged.
// NOTE(review): the row-selection logic is incomplete in this copy of the
// source (see the notes inside Reduce), so the class does not compile as
// written. Judging by the name it presumably keeps the first N rows of each
// group - confirm against the original script.
public class TopReducer : Reducer
{
// The output schema is a clone of the input schema.
public override Schema Produces(string[] columns, string[] args, Schema input)
{
return input.Clone();
}
// Iterates over the input rows and, for rows that pass the (missing)
// condition, copies the row into the shared output row and yields it.
public override IEnumerable<Row> Reduce(RowSet input, Row output, string[] args)
{
// NOTE(review): whatever preceded this ';' (presumably a local declaration
// such as a counter) is missing from this copy - restore it.
;
foreach (Row row in input.Rows)
{
// NOTE(review): the condition that guards the block below is missing; only
// its closing ')' survives. Restore the original "if (...)" header.
)
{
row.Copy(output);
yield return output;
}
}
}
}
2,对bing的所有浏览搜索的提取代码:
Scope:
//Script GUID:8e9ba2e3-8288-49e1-a5ff-7776d653ae16
//Used for tracking history
// Purpose: counts (query, normalized URL) pairs from the search-log page view
// for 2016-07-26 and writes the result to a .wsv file.
REFERENCE "/local/IndexQualityCJK/wb/WordBreaker.dll";
RESOURCE "/local/IndexQualityCJK/wb/unzip.exe";
RESOURCE "/local/IndexQualityCJK/wb/wordbreak.zip";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/SearchLogApi.dll";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Serializer.exe";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Microsoft.Live.Json.dll";
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";
USING MS.Internal.Bing.DataMining.SearchLogApi;
USING Microsoft.Live.Json;
// Input rowset: one day of page views, unsampled, Dataset = "Mobile".
// The two commented-out lines are earlier alternatives (session view, 2013 date).
SlapiPageView =
VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogPageView.view"
//VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogSessionView.view"
//PARAMS (Start = @"2013-09-01", End = @"2013-09-01", UseSample=false, Dataset="Mobile");
PARAMS (Start = "2016-07-26", End = "2016-07-26", UseSample=false, Dataset= "Mobile");
// Keeps zh-cn "web" vertical page views that are flagged neither as bot nor as
// marketing traffic, and projects the query, the request date (yyyy-MM-dd) and
// the web results of the page.
// WordBreaker.BreakWords comes from the referenced WordBreaker.dll; presumably
// it segments the raw query into words for the given market - confirm.
ZHCNTraffic =
SELECT WordBreaker.BreakWords(Query_RawQuery, "zh-cn") AS Query,
Request_RequestTime.ToString("yyyy-MM-dd") AS QDate,
Page_Entities_WebResults
FROM SlapiPageView
WHERE Request_IsBotVNext == false AND Request_IsMarketingTraffic == false
AND string.IsNullOrEmpty(Market) == false AND Market.ToLower() == "zh-cn"
AND string.IsNullOrEmpty(Vertical) == false AND Vertical.ToLower() == "web";
//AND string.IsNullOrEmpty(FormCode) == false AND (FormCode.ToUpper() == "QBLH" OR FormCode.ToUpper() == "QBRE" OR FormCode.ToUpper() == "MSNBHP" OR FormCode.ToUpper() == "MSNFLH" OR FormCode.ToUpper() == "BCNASI");
// Expands each page view into rows via the FEXLogSimpleExtractor processor
// defined in the accompanying C# code.
ProcessWebEntity =
PROCESS ZHCNTraffic
USING FEXLogSimpleExtractor;
// Counts rows per (Query, NormalizedUrl).
// There is no explicit GROUP BY; presumably the non-aggregated columns act as
// the grouping key - confirm against the SCOPE language reference.
ClickQueryUrlPairs =
SELECT Query,
RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Url)) AS NormalizedUrl,
COUNT() AS PairCount
FROM ProcessWebEntity
// NOTE(review): the numeric literals after "Click >" and "PairCount >=" are
// missing from this copy of the script, so the statement does not parse as
// written. Restore the intended thresholds before running.
WHERE Click > HAVING PairCount >= AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;
// No rowset is named after OUTPUT; presumably the most recently defined rowset
// (ClickQueryUrlPairs) is written - confirm.
OUTPUT
TO @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using MS.Internal.Bing.DataMining.SearchLogApi;
// URL helpers used to reduce result URLs to a comparable form.
public class URLUtility
{
    // Returns a simplified form of the given URL:
    //   - lower-cased (culture-invariant, so the result does not depend on the
    //     machine locale, e.g. the Turkish dotless-i rule),
    //   - with a leading "http://" or "https://" removed,
    //   - with a leading "www." removed,
    //   - with a single trailing "/" removed.
    // A null or empty input yields an empty string instead of throwing.
    // Note that the whole URL is lower-cased, including path and query.
    public static string NormalizeURL(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        url = url.ToLowerInvariant();

        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }

        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }

        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            // Drop exactly one trailing slash.
            url = url.Substring(0, url.Length - 1);
        }

        return url;
    }

    // Returns the part of the normalized URL that precedes the first "/",
    // i.e. the host (plus port, if any). When the normalized URL contains no
    // "/", the whole normalized URL is returned.
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }

        return url;
    }
}
// Processor that expands each input row (one page view) into output rows, one
// per web result of that page view that passes a (missing) condition.
// NOTE(review): two pieces of this class are missing from this copy of the
// source (see the notes inside Process), so it does not compile as written.
public class FEXLogSimpleExtractor : Processor
{
// Output columns: Query, QueryDate, Url, Host (strings) and POS, Click (ints).
public override Schema Produces(string[] columns, string[] args, Schema input)
{
return new Schema("Query:string,QueryDate:string,Url:string,Host:string,POS:int,Click:int");
}
// Reads "Query", "QDate" and "Page_Entities_WebResults" from every input row
// and walks the web results by index.
public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
{
foreach (Row row in input.Rows)
{
string Query = row["Query"].String;
// The input column is named "QDate"; it is emitted as "QueryDate".
string QueryDate = row["QDate"].String;
// NOTE(review): the result of this "as" cast is used below without a null
// check; a row whose value is not a WebResultList would fail on ".Count".
var WebEntities = row["Page_Entities_WebResults"].Value as MS.Internal.Bing.DataMining.SearchLogApi.WebResultList;
// NOTE(review): the start of this loop header is missing. Given the use of
// WebEntities[i] below it was presumably "for (int i = 0" - confirm and restore.
; i < WebEntities.Count;i++ )
{
string Url = WebEntities[i].TitleUrl;
// Host is derived from the title URL via URLUtility.GetHost.
string Host = URLUtility.GetHost(Url);
int Pos = WebEntities[i].PositionOfEntityInTopLevelRegion;
// Click is the number of entries in the result's Clicks collection.
int Click = WebEntities[i].Clicks.Count;
// NOTE(review): the condition that guards the block below is missing; only
// its closing ')' survives. Restore the original "if (...)" header.
)
{
output["Query"].Set(Query);
output["QueryDate"].Set(QueryDate);
output["Url"].Set(Url);
output["Host"].Set(Host);
output["POS"].Set(Pos);
output["Click"].Set(Click);
yield return output;
}
}
}
}
}
3,搜索查询和分类的提取代码:
//Script GUID:5b7abb8b-defd-4f2b-b703-9882ee6e960b
//Used for tracking history
// Purpose: runs RetroIndexProcessor over the latest index snapshot and writes
// Url, Country, Language and Category for rows whose Language is "zh_chs".
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/RetroIndexProcessor.dll";
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE @"/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";
USING RetroIndex;
// Input rowset: the latest snapshot view, unsampled.
Snapshot =
VIEW "/shares/searchWebLoad/RetroIndex/Views/LatestSnapshot.view"
PARAMS
(
Sample = false,
// NOTE(review): the value of TierFlag is missing from this copy of the script,
// so the PARAMS list does not parse as written. Restore the intended value.
TierFlag =
);
// This SELECT is not assigned to a named rowset and the PROCESS below names no
// input; presumably each statement consumes the result of the previous one, so
// statement order matters - confirm.
SELECT Url,
Header,
Body,
HttpHeader,
CodePage
FROM Snapshot;
// RetroIndexProcessor comes from the referenced RetroIndexProcessor.dll.
// The HAVING clause keeps only rows with a non-empty Language equal to
// "zh_chs" (case-insensitive via ToLower).
Uberchunk =
PROCESS
PRODUCE Url,
Country,
Language,
Category
USING RetroIndexProcessor
HAVING string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs";
// No rowset is named after OUTPUT; presumably Uberchunk is written - confirm.
OUTPUT
TO @"/local/IndexCJK/MobileTopSites/UrlCategory.wsv";
C#:
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
// Helpers that inspect a "#TAB#"-separated category string.
public class Utility
{
    // Only entries starting with this prefix are considered.
    private const string EntryPrefix = "aa00";

    // Separator between the entries of a category string.
    private static readonly string[] EntrySeparator = new string[] { "#TAB#" };

    // Entry suffixes accepted by CJKVersionMobileFriendly.
    private static readonly string[] MobileSuffixes = new string[]
    {
        "Mobi",
        "CrossDevice",
        "MobileFriendly",
        "MobileUnFriendly"
    };

    // Entry suffixes accepted by CJKVersionMobileUnFriendly.
    private static readonly string[] UnFriendlySuffixes = new string[]
    {
        "MobileUnFriendly"
    };

    // Returns true when at least one entry of the category string starts with
    // "aa00" and ends with "Mobi", "CrossDevice", "MobileFriendly" or
    // "MobileUnFriendly". Returns false for a null or empty string.
    public static bool CJKVersionMobileFriendly(string category)
    {
        return HasPrefixedEntryWithSuffix(category, MobileSuffixes);
    }

    // Returns true when at least one entry of the category string starts with
    // "aa00" and ends with "MobileUnFriendly". Returns false for a null or
    // empty string.
    public static bool CJKVersionMobileUnFriendly(string category)
    {
        return HasPrefixedEntryWithSuffix(category, UnFriendlySuffixes);
    }

    // Splits the category string on "#TAB#" (dropping empty entries) and
    // reports whether any entry carries the prefix and one of the suffixes.
    private static bool HasPrefixedEntryWithSuffix(string category, string[] suffixes)
    {
        if (string.IsNullOrEmpty(category))
        {
            return false;
        }

        string[] entries = category.Split(EntrySeparator, StringSplitOptions.RemoveEmptyEntries);
        for (int entryIndex = 0; entryIndex < entries.Length; entryIndex++)
        {
            string entry = entries[entryIndex];
            if (!entry.StartsWith(EntryPrefix))
            {
                continue;
            }

            for (int suffixIndex = 0; suffixIndex < suffixes.Length; suffixIndex++)
            {
                if (entry.EndsWith(suffixes[suffixIndex]))
                {
                    return true;
                }
            }
        }

        return false;
    }
}
// Processor that emits one (Url, MobileClassifier) row per accepted input row.
// NOTE(review): the declaration of MobileClassifier, three conditions and the
// three values assigned to MobileClassifier are missing from this copy of the
// source (see the notes inside Process), so the class does not compile as
// written and the meaning of the classifier values cannot be read from it.
public class CJKVersionMobileOkClassifierProcessor : Processor
{
// Output columns: Url (string) and MobileClassifier (int).
public override Schema Produces(string[] columns, string[] args, Schema input)
{
return new Schema("Url:string, MobileClassifier:int");
}
public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
{
foreach (Row row in input.Rows)
{
string Url = row["Url"].String;
string Language = row["Language"].String;
// Skip rows unless Url is non-empty and Language is "zh_chs" (compared after ToLower).
if (!(string.IsNullOrEmpty(Url) == false && string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs"))
{
continue;
}
//classifier features
string Category = row["Category"].String;
string DUPipeline_MobileUrls = row["DUPipeline_MobileUrls"].String;
string DUPipeline_ResponsiveDesignSpans = row["DUPipeline_ResponsiveDesignSpans"].String;
string DUV2_MobileUrl = row["DUV2_MobileUrl"].String;
string InjHdr_MobileOkClassifier_v1 = row["InjHdr_MobileOkClassifier_v1"].String;
string InjHdr_MobileOkX_v1 = row["InjHdr_MobileOkX_v1"].String;
string InjHdr_MobileRedirect_V1 = row["InjHdr_MobileRedirect_V1"].String;
// SpamJunkRuleID is read here but not referenced in the surviving text below;
// it may have been used in one of the lost conditions.
string SpamJunkRuleID = row["SpamJunkRuleID"].String;
// Prefer InjHdr_MobileOkClassifier_v1; fall back to InjHdr_MobileOkX_v1 when it is empty.
string MobileOkClassifier = string.IsNullOrEmpty(InjHdr_MobileOkClassifier_v1) ? InjHdr_MobileOkX_v1 : InjHdr_MobileOkClassifier_v1;
// NOTE(review): the statement before this ';' is missing. MobileClassifier is
// assigned and emitted below but never declared in the surviving text, so this
// was presumably its declaration with an initial value - restore it.
;
// First branch: taken when any of the mobile-URL / responsive-design /
// mobile-redirect features is non-empty, or the category passes
// Utility.CJKVersionMobileFriendly, or a final (lost) condition holds.
if (string.IsNullOrEmpty(DUPipeline_MobileUrls) == false
|| string.IsNullOrEmpty(DUPipeline_ResponsiveDesignSpans) == false
|| string.IsNullOrEmpty(DUV2_MobileUrl) == false
|| string.IsNullOrEmpty(InjHdr_MobileRedirect_V1) == false
|| Utility.CJKVersionMobileFriendly(Category)
// NOTE(review): most of this last operand is missing from this copy - restore it.
|| ("))
{
// NOTE(review): the assigned value is missing.
MobileClassifier = ;
}
// NOTE(review): the header of this branch (presumably an "else if (...)") is
// missing; only the tail of a string comparison survives.
"))
{
// NOTE(review): the assigned value is missing.
MobileClassifier = ;
}
// NOTE(review): the header of this branch is missing as well.
")
{
// NOTE(review): the assigned value is missing.
MobileClassifier = ;
}
output["Url"].Set(Url);
output["MobileClassifier"].Set(MobileClassifier);
yield return output;
}
}
}
4,对IE和bing进行union,然后对相同的query进行合并。
Scope
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history
// Purpose: concatenates the two (Query, Url, Count) files and sums Count per
// (Query, Url), ordered by Query.
// NOTE(review): the first extraction script in this document writes
// "IEqueryclickurlpairsAll.wsv", while this script reads
// "IEqueryclickurlpairs.wsv" - confirm which file is intended.
ie =
EXTRACT Query : string,
Url : string,
Count : int
FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
USING DefaultTextExtractor();
bing =
EXTRACT Query : string,
Url : string,
Count : int
FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
USING DefaultTextExtractor();
// UNION ALL keeps duplicate rows, so a pair present in both inputs appears
// twice here and is combined by the SUM below.
union_all =
SELECT *
FROM ie
UNION ALL
SELECT *
FROM bing;
// There is no explicit GROUP BY; presumably Query and Url act as the grouping
// key for SUM(Count) - confirm against the SCOPE language reference.
result =
SELECT Query,
Url,
SUM(Count) AS NewCount
FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
5,得到Query, Category,ClickCount的对应。
//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history
// NOTE(review): this script is identical to the union script of the previous
// section (same GUID, same inputs, same output path). The heading above it
// describes a Query / Category / ClickCount mapping, but nothing here reads or
// produces a Category column - the intended script was presumably not pasted.
// As written: concatenates the two (Query, Url, Count) files and sums Count
// per (Query, Url), ordered by Query.
ie =
EXTRACT Query : string,
Url : string,
Count : int
FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
USING DefaultTextExtractor();
bing =
EXTRACT Query : string,
Url : string,
Count : int
FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
USING DefaultTextExtractor();
// UNION ALL keeps duplicate rows, so a pair present in both inputs appears
// twice here and is combined by the SUM below.
union_all =
SELECT *
FROM ie
UNION ALL
SELECT *
FROM bing;
// There is no explicit GROUP BY; presumably Query and Url act as the grouping
// key for SUM(Count) - confirm against the SCOPE language reference.
result =
SELECT Query,
Url,
SUM(Count) AS NewCount
FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";
6,得到了query->Category之后,要算哪个category出现得最多。
每一条出现的地方*clickCount然后累加起来。
这里用到了reduce来做。
7,算分数。
Intern---Microsoft Academic China Team的更多相关文章
- Announcing Microsoft Research Open Data – Datasets by Microsoft Research now available in the cloud
The Microsoft Research Outreach team has worked extensively with the external research community to ...
- Team Foundation 中的错误和事件消息
Visual Studio Team System Team Foundation 中的错误和事件消息 Team Foundation 通过显示错误消息和事件消息来通知您操作成功以及操作失败.一部分错 ...
- Microsoft Dynamics CRM 分销行业解决方案
Microsoft Dynamics CRM 分销行业解决方案 方案亮点 360度动态渠道信息管理 充分的客户细分 全面的业务代表考核指标 业务代表管理和能力建设 业务代表过程管理 业务代表费用管理 ...
- Azure China (4) 管理Azure China Storage Account
<Windows Azure Platform 系列文章目录> Update 2015-05-10 强烈建议使用AzCopy工具,AzCopy命令行工具,是经过优化的.高性能Azure S ...
- Azure China (1) Azure公有云落地中国
<Windows Azure Platform 系列文章目录> 微软公有云Microsoft Azure已经落地中国,官方网址:http://www.windowsazure.cn/. 在 ...
- Microsoft server software support for Microsoft Azure virtual machines
http://support.microsoft.com/kb/2721672/en-us Article ID: 2721672 - Last Review: November 22, 2014 ...
- 如何访问Microsoft Azure Storage
首先先要创建存储账户 http://www.cnblogs.com/SignalTips/p/4119128.html 可以通过以下的几个方式访问 通过Visual Studio 2013 Commu ...
- Microsoft TFS 如何显示在Windows 的上下文菜单中
How to showing in Windows Explorer context for TFS I am not sure if this would help or you are willi ...
- Microsoft .NET Pet Shop 简介
最初研究 .NET Pet Shop 的目的是用 Microsoft .NET 实现 Sun 主要的 J2EE 蓝图应用程序 Sun Java Pet Store 同样的应用程序功能. 根据用 .NE ...
随机推荐
- [LeetCode] Set Matrix Zeroes 矩阵赋零
Given a m x n matrix, if an element is 0, set its entire row and column to 0. Do it in place. click ...
- Openfire 集群部署和负载均衡方案
Openfire 集群部署和负载均衡方案 一. 概述 Openfire是在即时通讯中广泛使用的XMPP协议通讯服务器,本方案采用Openfire的Hazelcast插件进行集群部署,采用Hapro ...
- Spark MLlib - LFW
val path = "/usr/data/lfw-a/*" val rdd = sc.wholeTextFiles(path) val first = rdd.first pri ...
- FCM聚类算法介绍
FCM算法是一种基于划分的聚类算法,它的思想就是使得被划分到同一簇的对象之间相似度最大,而不同簇之间的相似度最小.模糊C均值算法是普通C均值算法的改进,普通C均值算法对于数据的划分是硬性的,而FCM则 ...
- SQL基础语法(三)
SQL WHERE 子句 WHERE 子句用于规定选择的标准. WHERE 子句 如需有条件地从表中选取数据,可将 WHERE 子句添加到 SELECT 语句. 语法SELECT 列名称 FROM 表 ...
- 使用css3做钟表
<!DOCTYPE html> <html> <head> <meta charset="UTF-8"> <title> ...
- php代码基础
如何接入新浪api <?php function getWeiboData() { $count = 15; // 参数source后面输入你的授权号 $url = "https:// ...
- 1.ios synthesize有什么作用
###1.ios synthesize有什么作用 当定义了一系列的变量时,需要写很多的getter和setter方法,而且它们的形式都是差不多的,所以Xcode提供了@property和@synthe ...
- NTFS交换数据流隐写的应用
by Chesky ##目录 ####一.NTFS交换数据流(ADS)简介 ####二.ADS应用 写入隐藏文件(文本\图像\可执行文件) ADS在Windows平台下的利用--写入后门 ADS在We ...
- 火车头采集ecshop 文章接口文件
<?php header("Content-Type:text/html;charset=utf-8"); $host = $_SERVER['HTTP_HOST']; // ...