项目二:

AEther:

项目 一、项目需求:对搜索关键词进行类别的统计分析,为了后面的entity-rank做准备。

0,各种关键数据统计:

数据量:1个月数据:about 1000T。

1,对IE的所有浏览搜索的提取代码:

Scope:

//Script GUID:ad2766d3-7aec-4ffa-9bbd-ec2740361999
//Used for tracking history
// Purpose: read the competitive unified page view for the date range in PARAMS and count
// (query, normalized request URL) pairs for zh-cn, web-vertical page views that were
// reached from a query page and are not themselves query pages.
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";
// Input view; Start and End are the same day, so one day of data is read.
UnifiedViewRaw =
    VIEW "/shares/searchDM/distrib/released/CompetitiveUnifiedView/CompetitiveUnifiedPageView.view"
    PARAMS
    (
        Start = @"2016-07-26",
        End = @"2016-07-26",
        Source = @"All"
//      Source = @"DesktopIE"
    );
// NOTE(review): the numeric threshold after "HAVING Count >=" is missing in this copy of the
// script; the statement cannot parse until the original value is restored.
// NOTE(review): there is no explicit GROUP BY; presumably the aggregate groups by the
// non-aggregated columns (Query, NormalizedUrl) -- confirm.
ClickData =
    SELECT Page_FromPage.Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Request_Url)) AS NormalizedUrl,
           COUNT() AS Count
    FROM UnifiedViewRaw
    WHERE Page_FromPage.IsQuery == true AND Page_FromPage.Vertical.ToLower() == "web" AND NOT Request_IsQuery AND Page_FromPage.Market.ToLower() == "zh-cn"
    HAVING Count >=  AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;

    // Page_FromPage.IsQuery: True if the page is a query page
    // Vertical: Search Vertical of this PageView
    // Request_IsQuery bool: True if this page view is search engine result page

// NOTE(review): OUTPUT names no rowset; presumably the most recent one (ClickData) is written -- confirm.
OUTPUT
TO @"/my/entityranker/oneDay/IEqueryclickurlpairsAll.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

/// <summary>
/// String helpers that reduce a URL to a canonical lower-case form and extract its host part.
/// </summary>
public class CMyUtils
{
    /// <summary>
    /// Lower-cases <paramref name="url"/> and strips a leading "http://" or "https://",
    /// a leading "www." and a single trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>The normalized URL, or an empty string when the input is null or empty.</returns>
    public static string NormalizeURL(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // URLs are machine data: lower-case and compare without culture rules so the
        // result does not depend on the locale of the machine running the job.
        url = url.ToLowerInvariant();
        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }
        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }
        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            // Drop exactly one trailing slash.
            url = url.Substring(0, url.Length - 1);
        }
        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to extract the host from; may be null or empty.</param>
    /// <returns>
    /// The normalized URL up to (not including) its first slash, or the whole normalized
    /// URL when it contains no slash.
    /// </returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }
        return url;
    }
}

/// <summary>
/// Reducer whose output schema is a clone of its input schema and which copies input rows
/// to the output row unchanged.
/// </summary>
/// <remarks>
/// NOTE(review): two fragments of this class were lost in this copy of the code (marked
/// below). Judging by the class name they presumably kept only the first N rows of the
/// input, but neither the counter nor N is visible here -- restore from the original source.
/// </remarks>
public class TopReducer : Reducer
{
    /// <summary>Declares the output schema as a copy of the input schema.</summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return input.Clone();
    }

    /// <summary>
    /// Iterates the input rows and yields the shared <paramref name="output"/> row after
    /// copying the current input row into it.
    /// </summary>
    public override IEnumerable<Row> Reduce(RowSet input, Row output, string[] args)
    {
        // NOTE(review): the statement that belongs before this ';' is missing
        // (presumably a counter declaration) -- confirm.
        ;
        foreach (Row row in input.Rows)
        {
            // NOTE(review): the head of this condition is missing; only its closing ')'
            // survived. The code does not compile until it is restored.
            )
            {
                row.Copy(output);
                yield return output;
            }
        }
    }
}

2,对bing的所有浏览搜索的提取代码:

Scope:

//Script GUID:8e9ba2e3-8288-49e1-a5ff-7776d653ae16
//Used for tracking history
// Purpose: read one day of SLAPI page views, keep zh-cn web-vertical traffic that is neither
// bot nor marketing traffic, expand each page's web results into rows with
// FEXLogSimpleExtractor, and count (word-broken query, normalized URL) pairs.
REFERENCE "/local/IndexQualityCJK/wb/WordBreaker.dll";
RESOURCE "/local/IndexQualityCJK/wb/unzip.exe";
RESOURCE "/local/IndexQualityCJK/wb/wordbreak.zip";

REFERENCE @"/shares/searchDM/distrib/released/SLAPI/SearchLogApi.dll";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Serializer.exe";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Microsoft.Live.Json.dll";

REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING MS.Internal.Bing.DataMining.SearchLogApi;
USING Microsoft.Live.Json;

// NOTE(review): Dataset is "Mobile"; confirm this is the intended dataset for this step.
SlapiPageView =
      VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogPageView.view"
       //VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogSessionView.view"
       //PARAMS (Start = @"2013-09-01", End = @"2013-09-01", UseSample=false, Dataset="Mobile");
       PARAMS (Start = "2016-07-26", End = "2016-07-26", UseSample=false, Dataset= "Mobile");

// Columns produced here (Query, QDate, Page_Entities_WebResults) are the ones the
// FEXLogSimpleExtractor code-behind reads by name.
ZHCNTraffic =
    SELECT WordBreaker.BreakWords(Query_RawQuery, "zh-cn") AS Query,
           Request_RequestTime.ToString("yyyy-MM-dd") AS QDate,
           Page_Entities_WebResults
    FROM SlapiPageView
    WHERE Request_IsBotVNext == false AND Request_IsMarketingTraffic == false
          AND string.IsNullOrEmpty(Market) == false AND Market.ToLower() == "zh-cn"
          AND string.IsNullOrEmpty(Vertical) == false AND Vertical.ToLower() == "web";
//AND string.IsNullOrEmpty(FormCode) == false AND (FormCode.ToUpper() == "QBLH" OR FormCode.ToUpper() == "QBRE" OR FormCode.ToUpper() == "MSNBHP" OR FormCode.ToUpper() == "MSNFLH" OR FormCode.ToUpper() == "BCNASI");

ProcessWebEntity =
    PROCESS ZHCNTraffic
    USING FEXLogSimpleExtractor;

// NOTE(review): the numeric thresholds after "Click >" and "PairCount >=" are missing in this
// copy of the script; the statement cannot parse until the original values are restored.
ClickQueryUrlPairs =
    SELECT Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Url)) AS NormalizedUrl,
           COUNT() AS PairCount
    FROM ProcessWebEntity
    WHERE Click >  HAVING PairCount >=  AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;

// NOTE(review): OUTPUT names no rowset; presumably ClickQueryUrlPairs is written -- confirm.
OUTPUT
TO @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using MS.Internal.Bing.DataMining.SearchLogApi;

/// <summary>
/// String helpers that reduce a URL to a canonical lower-case form and extract its host part.
/// </summary>
public class URLUtility
{
    /// <summary>
    /// Lower-cases <paramref name="url"/> and strips a leading "http://" or "https://",
    /// a leading "www." and a single trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>The normalized URL, or an empty string when the input is null or empty.</returns>
    public static string NormalizeURL(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // URLs are machine data: lower-case and compare without culture rules so the
        // result does not depend on the locale of the machine running the job.
        url = url.ToLowerInvariant();
        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }
        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }
        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            // Drop exactly one trailing slash.
            url = url.Substring(0, url.Length - 1);
        }
        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to extract the host from; may be null or empty.</param>
    /// <returns>
    /// The normalized URL up to (not including) its first slash, or the whole normalized
    /// URL when it contains no slash.
    /// </returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');
        if (slashPosition >= 0)
        {
            url = url.Substring(0, slashPosition);
        }
        return url;
    }
}

/// <summary>
/// Processor that expands the web-result list of each input row into one output row per
/// web result, carrying the query, query date, result URL, host, position and click count.
/// </summary>
/// <remarks>
/// NOTE(review): the head of the <c>for</c> loop and the condition guarding the emit were
/// lost in this copy of the code (marked below); the class does not compile until they are
/// restored from the original source.
/// </remarks>
public class FEXLogSimpleExtractor : Processor
{
    /// <summary>Declares the fixed six-column output schema.</summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Query:string,QueryDate:string,Url:string,Host:string,POS:int,Click:int");
    }

    /// <summary>
    /// Reads the "Query", "QDate" and "Page_Entities_WebResults" columns of every input row
    /// and yields the shared <paramref name="output"/> row once per emitted web result.
    /// </summary>
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string Query = row["Query"].String;
            string QueryDate = row["QDate"].String;
            // NOTE(review): 'as' yields null when the column value is not a WebResultList;
            // no null check is visible before WebEntities.Count is read below.
            var WebEntities = row["Page_Entities_WebResults"].Value as MS.Internal.Bing.DataMining.SearchLogApi.WebResultList;

            // NOTE(review): the start of this loop header is missing (presumably
            // "for (int i = 0") -- confirm against the original source.
            ; i < WebEntities.Count;i++ )
            {
                string Url = WebEntities[i].TitleUrl;
                // Host is derived from the result URL by URLUtility.GetHost.
                string Host = URLUtility.GetHost(Url);
                int Pos = WebEntities[i].PositionOfEntityInTopLevelRegion;
                // Click is the number of entries in the result's Clicks collection.
                int Click = WebEntities[i].Clicks.Count;

                // NOTE(review): the condition that guards this block is missing; only its
                // closing ')' survived, so which results get emitted cannot be told from here.
                )
                {
                    output["Query"].Set(Query);
                    output["QueryDate"].Set(QueryDate);
                    output["Url"].Set(Url);
                    output["Host"].Set(Host);
                    output["POS"].Set(Pos);
                    output["Click"].Set(Click);

                    // The same Row instance is re-populated and yielded for every result.
                    yield return output;
                }
            }
        }
    }
}

3,搜索查询和分类的提取代码:

//Script GUID:5b7abb8b-defd-4f2b-b703-9882ee6e960b
//Used for tracking history
// Purpose: read the latest RetroIndex snapshot, run RetroIndexProcessor over it to produce
// Url, Country, Language and Category, and keep only rows whose Language is "zh_chs".
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/RetroIndexProcessor.dll";
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE @"/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING RetroIndex;

// NOTE(review): the value of TierFlag is missing in this copy of the script; the PARAMS
// block cannot parse until the original value is restored.
Snapshot =
    VIEW "/shares/searchWebLoad/RetroIndex/Views/LatestSnapshot.view"
    PARAMS
    (
        Sample = false,
        TierFlag =
    );

// NOTE(review): this SELECT is not assigned to a named rowset and the PROCESS below names no
// input; presumably PROCESS consumes this anonymous rowset -- confirm.
SELECT Url,
       Header,
       Body,
       HttpHeader,
       CodePage
FROM Snapshot;

Uberchunk =
    PROCESS
    PRODUCE Url,
            Country,
            Language,
            Category
    USING RetroIndexProcessor
    HAVING string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs";

// NOTE(review): OUTPUT names no rowset; presumably Uberchunk is written -- confirm.
OUTPUT
TO @"/local/IndexCJK/MobileTopSites/UrlCategory.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

/// <summary>
/// Predicates over a category string made of entries separated by the literal "#TAB#".
/// </summary>
public class Utility
{
    // Literal separator between entries of the category string.
    private static readonly string[] EntrySeparators = { "#TAB#" };

    // Only entries starting with this prefix are considered by either predicate.
    private const string RequiredPrefix = "aa00";

    // Entry endings accepted by CJKVersionMobileFriendly.
    private static readonly string[] MobileFriendlySuffixes =
    {
        "Mobi",
        "CrossDevice",
        "MobileFriendly",
        "MobileUnFriendly"
    };

    // Entry endings accepted by CJKVersionMobileUnFriendly.
    private static readonly string[] MobileUnFriendlySuffixes =
    {
        "MobileUnFriendly"
    };

    /// <summary>
    /// Returns true when any entry of <paramref name="category"/> starts with "aa00" and ends
    /// with "Mobi", "CrossDevice", "MobileFriendly" or "MobileUnFriendly".
    /// </summary>
    /// <param name="category">"#TAB#"-separated category entries; may be null or empty.</param>
    /// <returns>True if a matching entry exists; false otherwise, including for null or empty input.</returns>
    public static bool CJKVersionMobileFriendly(string category)
    {
        return HasPrefixedEntryEndingWith(category, MobileFriendlySuffixes);
    }

    /// <summary>
    /// Returns true when any entry of <paramref name="category"/> starts with "aa00" and ends
    /// with "MobileUnFriendly".
    /// </summary>
    /// <param name="category">"#TAB#"-separated category entries; may be null or empty.</param>
    /// <returns>True if a matching entry exists; false otherwise, including for null or empty input.</returns>
    public static bool CJKVersionMobileUnFriendly(string category)
    {
        return HasPrefixedEntryEndingWith(category, MobileUnFriendlySuffixes);
    }

    // Shared scan: true when some entry has the required prefix and one of the given suffixes.
    // Comparisons are ordinal because the tags are machine identifiers, not linguistic text;
    // culture-sensitive matching would skip ignorable characters and vary with the locale.
    private static bool HasPrefixedEntryEndingWith(string category, string[] suffixes)
    {
        if (string.IsNullOrEmpty(category))
        {
            return false;
        }

        string[] entries = category.Split(EntrySeparators, StringSplitOptions.RemoveEmptyEntries);
        foreach (string entry in entries)
        {
            if (!entry.StartsWith(RequiredPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            foreach (string suffix in suffixes)
            {
                if (entry.EndsWith(suffix, StringComparison.Ordinal))
                {
                    return true;
                }
            }
        }

        return false;
    }
}

/// <summary>
/// Processor that emits (Url, MobileClassifier) for input rows with a non-empty Url and a
/// Language equal to "zh_chs" (case-insensitive); all other rows are skipped.
/// </summary>
/// <remarks>
/// NOTE(review): the declaration of <c>MobileClassifier</c>, parts of the branch conditions
/// and every value assigned to it were lost in this copy of the code (marked below), so the
/// meaning of the classifier values cannot be told from here. The class does not compile
/// until they are restored from the original source.
/// </remarks>
public class CJKVersionMobileOkClassifierProcessor : Processor
{
    /// <summary>Declares the two-column output schema.</summary>
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Url:string, MobileClassifier:int");
    }

    /// <summary>
    /// Reads the feature columns of each qualifying row, derives MobileClassifier and yields
    /// the shared <paramref name="output"/> row.
    /// </summary>
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string Url = row["Url"].String;
            string Language = row["Language"].String;

            // Skip rows without a Url or whose Language is not "zh_chs".
            if (!(string.IsNullOrEmpty(Url) == false && string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs"))
            {
                continue;
            }

            //classifier features
            string Category = row["Category"].String;
            string DUPipeline_MobileUrls = row["DUPipeline_MobileUrls"].String;
            string DUPipeline_ResponsiveDesignSpans = row["DUPipeline_ResponsiveDesignSpans"].String;
            string DUV2_MobileUrl = row["DUV2_MobileUrl"].String;
            string InjHdr_MobileOkClassifier_v1 = row["InjHdr_MobileOkClassifier_v1"].String;
            string InjHdr_MobileOkX_v1 = row["InjHdr_MobileOkX_v1"].String;
            string InjHdr_MobileRedirect_V1 = row["InjHdr_MobileRedirect_V1"].String;
            // NOTE(review): SpamJunkRuleID is not used in any line that survived; it may have
            // been used in one of the lost conditions below -- confirm.
            string SpamJunkRuleID = row["SpamJunkRuleID"].String;

            // Prefer InjHdr_MobileOkClassifier_v1; fall back to InjHdr_MobileOkX_v1 when it is empty.
            string MobileOkClassifier = string.IsNullOrEmpty(InjHdr_MobileOkClassifier_v1) ? InjHdr_MobileOkX_v1 : InjHdr_MobileOkClassifier_v1;

            // NOTE(review): the statement before this ';' is missing (presumably the
            // declaration and initial value of MobileClassifier) -- confirm.
            ;
            // First branch: taken when any mobile-related feature is non-empty or the
            // category passes Utility.CJKVersionMobileFriendly.
            // NOTE(review): the last operand of this condition is truncated.
            if (string.IsNullOrEmpty(DUPipeline_MobileUrls) == false
                || string.IsNullOrEmpty(DUPipeline_ResponsiveDesignSpans) == false
                || string.IsNullOrEmpty(DUV2_MobileUrl) == false
                || string.IsNullOrEmpty(InjHdr_MobileRedirect_V1) == false
                || Utility.CJKVersionMobileFriendly(Category)
                || ("))
            {
                // NOTE(review): assigned value is missing.
                MobileClassifier = ;
            }
            // NOTE(review): the head of this branch condition is missing.
            "))
            {
                // NOTE(review): assigned value is missing.
                MobileClassifier = ;
            }
            // NOTE(review): the head of this branch condition is missing.
            ")
            {
                // NOTE(review): assigned value is missing.
                MobileClassifier = ;
            }

            output["Url"].Set(Url);
            output["MobileClassifier"].Set(MobileClassifier);
            yield return output;
        }
    }
}

4,对IE和bing进行union,然后对相同的query进行合并。

Scope

//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history
// Purpose: load the two (Query, Url, Count) files, concatenate them with UNION ALL, sum Count
// per (Query, Url), and write the result ordered by Query.

// NOTE(review): this reads "IEqueryclickurlpairs.wsv", whereas the step-1 script in this
// document writes "IEqueryclickurlpairsAll.wsv" -- confirm which file is intended.
ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();
bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

// UNION ALL keeps duplicates, so a pair present in both inputs contributes both counts to the sum.
union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

// NOTE(review): there is no explicit GROUP BY; presumably SUM groups by the non-aggregated
// columns (Query, Url) -- confirm.
result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";

5,得到Query, Category,ClickCount的对应。(注意:下面贴出的脚本与第4步的脚本完全相同,连 Script GUID 都一样,其中并没有 Category 列,疑似贴错了代码。)

//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history
// NOTE(review): this script is identical, including the Script GUID and the output path, to
// the step-4 union script earlier in this document. It produces Query, Url and NewCount and
// no Category column, so it is probably not the script this step was meant to show.
// What it does: load the two (Query, Url, Count) files, concatenate them with UNION ALL,
// sum Count per (Query, Url), and write the result ordered by Query.

ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();
bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

// UNION ALL keeps duplicates, so a pair present in both inputs contributes both counts to the sum.
union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

// NOTE(review): there is no explicit GROUP BY; presumably SUM groups by the non-aggregated
// columns (Query, Url) -- confirm.
result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";

6,得到了query->Category之后,要算哪个category出现得最多。

每一条出现的地方*clickCount然后累加起来。

这里用到了reduce来做。

7,算分数。

Intern---Microsoft Academic China Team的更多相关文章

  1. Announcing Microsoft Research Open Data – Datasets by Microsoft Research now available in the cloud

    The Microsoft Research Outreach team has worked extensively with the external research community to ...

  2. Team Foundation 中的错误和事件消息

    Visual Studio Team System Team Foundation 中的错误和事件消息 Team Foundation 通过显示错误消息和事件消息来通知您操作成功以及操作失败.一部分错 ...

  3. Microsoft Dynamics CRM 分销行业解决方案

    Microsoft Dynamics CRM 分销行业解决方案 方案亮点 360度动态渠道信息管理 充分的客户细分 全面的业务代表考核指标 业务代表管理和能力建设 业务代表过程管理 业务代表费用管理 ...

  4. Azure China (4) 管理Azure China Storage Account

    <Windows Azure Platform 系列文章目录> Update 2015-05-10 强烈建议使用AzCopy工具,AzCopy命令行工具,是经过优化的.高性能Azure S ...

  5. Azure China (1) Azure公有云落地中国

    <Windows Azure Platform 系列文章目录> 微软公有云Microsoft Azure已经落地中国,官方网址:http://www.windowsazure.cn/. 在 ...

  6. Microsoft server software support for Microsoft Azure virtual machines

    http://support.microsoft.com/kb/2721672/en-us  Article ID: 2721672 - Last Review: November 22, 2014 ...

  7. 如何访问Microsoft Azure Storage

    首先先要创建存储账户 http://www.cnblogs.com/SignalTips/p/4119128.html 可以通过以下的几个方式访问 通过Visual Studio 2013 Commu ...

  8. Microsoft TFS 如何显示在Windows 的上下文菜单中

    How to showing in Windows Explorer context for TFS I am not sure if this would help or you are willi ...

  9. Microsoft .NET Pet Shop 简介

    最初研究 .NET Pet Shop 的目的是用 Microsoft .NET 实现 Sun 主要的 J2EE 蓝图应用程序 Sun Java Pet Store 同样的应用程序功能. 根据用 .NE ...

随机推荐

  1. [LeetCode] Rotate List 旋转链表

    Given a list, rotate the list to the right by k places, where k is non-negative. For example:Given 1 ...

  2. Ubuntu下git的安装与使用

    Ubuntu下git的安装与使用 Ubuntu下git的安装与使用与Windows下的大致相同,只不过个人感觉在Ubuntu下使用git更方便. 首先,确认你的系统是否已安装git,可以通过git指令 ...

  3. [开源].NET高性能框架Chloe.ORM-完美支持SQLite

    扯淡 这是一款轻量.高效的.NET C#数据库访问框架(ORM).查询接口借鉴 Linq(但不支持 Linq).借助 lambda 表达式,可以完全用面向对象的方式就能轻松执行多表连接查询.分组查询. ...

  4. ACM模板(持续补完)

    1.KMP #include<cstring> #include<algorithm> #include<cstdio> using namespace std; ...

  5. RabbitMQ总结概念

    AMQP:一个提供统一消息服务的应用层标准高级消息队列协议,是应用层协议的一个开放标准,为面向消息的中间件设计 http://www.diggerplus.org/archives/3110 AMQP ...

  6. 用EmEditor实现PDF转Word后的对齐排版

    Redraw = false//禁止重绘(类似于VBA中的: Application.screenupdating=FALSE),以提高运行效率 //去除所有空行和只由空白字符构成的行 documen ...

  7. 测试对于list的sort与sorted的效率

    sorted from time import clock from random import randint start = clock() a = [randint(0,1000000) for ...

  8. caffe代码调试小结

    RELULayer层 bottom[0]->count=n*c*w*h=50*96*56*56 count=50*96*56*56,根据bottom_data[i]访问所有的数据(多维数组都是一 ...

  9. c# json总结

    json确实很好用,但是网上写的很多都很复杂,不适合自己,然后每次写了又记不住,又要反复找,所以将其中的代码写下来.以后看这个就可以了 都引用了 Newtonsoft.Json 1.ashx,asmx ...

  10. Word2016(2013)怎么从任意页插入起始页码

    添加页码   1 小知识:双击任意页的页脚或页眉区域,激活页眉页脚设计界面.双击文档正文区域,返回正文编辑界面. 2 双击任意页的页脚或页眉区域,激活页眉页脚设计界面.单击"页码" ...