项目二:

AEther:

项目 一、项目需求:对搜索关键词进行类别的统计分析,为了后面的entity-rank做准备。

0,各种关键数据统计:

数据量:1个月数据:about 1000T。

1,对IE的所有浏览搜索的提取代码:

Scope:

//Script GUID:ad2766d3-7aec-4ffa-9bbd-ec2740361999
//Used for tracking history
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";
UnifiedViewRaw =
    VIEW "/shares/searchDM/distrib/released/CompetitiveUnifiedView/CompetitiveUnifiedPageView.view"
    PARAMS
    (
        Start = @"2016-07-26",
        End = @"2016-07-26",
        Source = @"All"
//      Source = @"DesktopIE"
    );
ClickData =
    SELECT Page_FromPage.Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Request_Url)) AS NormalizedUrl,
           COUNT() AS Count
    FROM UnifiedViewRaw
    WHERE Page_FromPage.IsQuery == true AND Page_FromPage.Vertical.ToLower() == "web" AND NOT Request_IsQuery AND Page_FromPage.Market.ToLower() == "zh-cn"
    HAVING Count >=  AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;

    // Page_FromPage.IsQuery: True if the page is a query page
    // Vertical: Search Vertical of this PageView
    // Request_IsQuery bool: True if this page view is search engine result page

OUTPUT
TO @"/my/entityranker/oneDay/IEqueryclickurlpairsAll.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

public class CMyUtils
{
    static public string NormalizeURL(string url)
    {
        url = url.ToLower();
        if (url.StartsWith("http://"))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://"))
        {
            url = url.Substring("https://".Length);
        }
        if (url.StartsWith("www."))
        {
            url = url.Substring("www.".Length);
        }
        if (url.EndsWith("/"))
        {
            url = url.Substring(, url.Length - );
        }
        return url;
    }

    static public string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');
        )
        {
            url = url.Substring(, slashPosition);
        }
        return url;
    }
}

public class TopReducer : Reducer
{
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return input.Clone();
    }

    public override IEnumerable<Row> Reduce(RowSet input, Row output, string[] args)
    {
        ;
        foreach (Row row in input.Rows)
        {
            )
            {
                row.Copy(output);
                yield return output;
            }
        }
    }
}

2,对bing的所有浏览搜索的提取代码:

Scope:

//Script GUID:8e9ba2e3-8288-49e1-a5ff-7776d653ae16
//Used for tracking history
REFERENCE "/local/IndexQualityCJK/wb/WordBreaker.dll";
RESOURCE "/local/IndexQualityCJK/wb/unzip.exe";
RESOURCE "/local/IndexQualityCJK/wb/wordbreak.zip";

REFERENCE @"/shares/searchDM/distrib/released/SLAPI/SearchLogApi.dll";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Serializer.exe";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Microsoft.Live.Json.dll";

REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING MS.Internal.Bing.DataMining.SearchLogApi;
USING Microsoft.Live.Json;

SlapiPageView =
      VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogPageView.view"
       //VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogSessionView.view"
       //PARAMS (Start = @"2013-09-01", End = @"2013-09-01", UseSample=false, Dataset="Mobile");
       PARAMS (Start = "2016-07-26", End = "2016-07-26", UseSample=false, Dataset= "Mobile");

ZHCNTraffic =
    SELECT WordBreaker.BreakWords(Query_RawQuery, "zh-cn") AS Query,
           Request_RequestTime.ToString("yyyy-MM-dd") AS QDate,
           Page_Entities_WebResults
    FROM SlapiPageView
    WHERE Request_IsBotVNext == false AND Request_IsMarketingTraffic == false
          AND string.IsNullOrEmpty(Market) == false AND Market.ToLower() == "zh-cn"
          AND string.IsNullOrEmpty(Vertical) == false AND Vertical.ToLower() == "web";
//AND string.IsNullOrEmpty(FormCode) == false AND (FormCode.ToUpper() == "QBLH" OR FormCode.ToUpper() == "QBRE" OR FormCode.ToUpper() == "MSNBHP" OR FormCode.ToUpper() == "MSNFLH" OR FormCode.ToUpper() == "BCNASI");

ProcessWebEntity =
    PROCESS ZHCNTraffic
    USING FEXLogSimpleExtractor;

ClickQueryUrlPairs =
    SELECT Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Url)) AS NormalizedUrl,
           COUNT() AS PairCount
    FROM ProcessWebEntity
    WHERE Click >  HAVING PairCount >=  AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;

OUTPUT
TO @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using MS.Internal.Bing.DataMining.SearchLogApi;

public class URLUtility
{
    static public string NormalizeURL(string url)
    {
        url = url.ToLower();
        if (url.StartsWith("http://"))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://"))
        {
            url = url.Substring("https://".Length);
        }
        if (url.StartsWith("www."))
        {
            url = url.Substring("www.".Length);
        }
        if (url.EndsWith("/"))
        {
            url = url.Substring(, url.Length - );
        }
        return url;
    }

    static public string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');
        )
        {
            url = url.Substring(, slashPosition);
        }
        return url;
    }
}

public class FEXLogSimpleExtractor : Processor
{
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Query:string,QueryDate:string,Url:string,Host:string,POS:int,Click:int");
    }

    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string Query = row["Query"].String;
            string QueryDate = row["QDate"].String;
            var WebEntities = row["Page_Entities_WebResults"].Value as MS.Internal.Bing.DataMining.SearchLogApi.WebResultList;

            ; i < WebEntities.Count;i++ )
            {
                string Url = WebEntities[i].TitleUrl;
                string Host = URLUtility.GetHost(Url);
                int Pos = WebEntities[i].PositionOfEntityInTopLevelRegion;
                int Click = WebEntities[i].Clicks.Count;

                )
                {
                    output["Query"].Set(Query);
                    output["QueryDate"].Set(QueryDate);
                    output["Url"].Set(Url);
                    output["Host"].Set(Host);
                    output["POS"].Set(Pos);
                    output["Click"].Set(Click);

                    yield return output;
                }
            }
        }
    }
}

3,搜索查询和分类的提取代码:

//Script GUID:5b7abb8b-defd-4f2b-b703-9882ee6e960b
//Used for tracking history
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/RetroIndexProcessor.dll";
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE @"/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING RetroIndex;

Snapshot =
    VIEW "/shares/searchWebLoad/RetroIndex/Views/LatestSnapshot.view"
    PARAMS
    (
        Sample = false,
        TierFlag =
    );

SELECT Url,
       Header,
       Body,
       HttpHeader,
       CodePage
FROM Snapshot;

Uberchunk =
    PROCESS
    PRODUCE Url,
            Country,
            Language,
            Category
    USING RetroIndexProcessor
    HAVING string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs";

OUTPUT
TO @"/local/IndexCJK/MobileTopSites/UrlCategory.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

public class Utility
{
    public static bool CJKVersionMobileFriendly(string category)
    {
        if (string.IsNullOrEmpty(category))
        {
            return false;
        }

        string[] cates = category.Split(new string[] { "#TAB#" }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string cate in cates)
        {
            if(cate.StartsWith("aa00") &&
                (cate.EndsWith("Mobi")
                || cate.EndsWith("CrossDevice")
                || cate.EndsWith("MobileFriendly")
                || cate.EndsWith("MobileUnFriendly")))
            {
                return true;
            }
        }
        return false;
    }

    public static bool CJKVersionMobileUnFriendly(string category)
    {
        if (string.IsNullOrEmpty(category))
        {
            return false;
        }

        string[] cates = category.Split(new string[] { "#TAB#" }, StringSplitOptions.RemoveEmptyEntries);
        foreach (string cate in cates)
        {
            if (cate.StartsWith("aa00") && cate.EndsWith("MobileUnFriendly"))
            {
                return true;
            }
        }

        return false;
    }
}

public class CJKVersionMobileOkClassifierProcessor : Processor
{
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Url:string, MobileClassifier:int");
    }

    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string Url = row["Url"].String;
            string Language = row["Language"].String;

            if (!(string.IsNullOrEmpty(Url) == false && string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs"))
            {
                continue;
            }

            //classifier features
            string Category = row["Category"].String;
            string DUPipeline_MobileUrls = row["DUPipeline_MobileUrls"].String;
            string DUPipeline_ResponsiveDesignSpans = row["DUPipeline_ResponsiveDesignSpans"].String;
            string DUV2_MobileUrl = row["DUV2_MobileUrl"].String;
            string InjHdr_MobileOkClassifier_v1 = row["InjHdr_MobileOkClassifier_v1"].String;
            string InjHdr_MobileOkX_v1 = row["InjHdr_MobileOkX_v1"].String;
            string InjHdr_MobileRedirect_V1 = row["InjHdr_MobileRedirect_V1"].String;
            string SpamJunkRuleID = row["SpamJunkRuleID"].String;

            string MobileOkClassifier = string.IsNullOrEmpty(InjHdr_MobileOkClassifier_v1) ? InjHdr_MobileOkX_v1 : InjHdr_MobileOkClassifier_v1;

            ;
            if (string.IsNullOrEmpty(DUPipeline_MobileUrls) == false
                || string.IsNullOrEmpty(DUPipeline_ResponsiveDesignSpans) == false
                || string.IsNullOrEmpty(DUV2_MobileUrl) == false
                || string.IsNullOrEmpty(InjHdr_MobileRedirect_V1) == false
                || Utility.CJKVersionMobileFriendly(Category)
                || ("))
            {
                MobileClassifier = ;
            }
            "))
            {
                MobileClassifier = ;
            }
            ")
            {
                MobileClassifier = ;
            }

            output["Url"].Set(Url);
            output["MobileClassifier"].Set(MobileClassifier);
            yield return output;
        }
    }
}

4,对IE和bing进行union,然后对相同的query进行合并。

Scope

//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history

ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();
bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";

5,得到Query, Category,ClickCount的对应。

//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history

ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();
bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";

6,得到了query->Category之后,要算那个category出现的最多。

每一条出现的地方*clickCount然后累加起来。

这里用到了reduce来做。

7,算分数。

Intern---Microsoft Academic China Team的更多相关文章

  1. Announcing Microsoft Research Open Data – Datasets by Microsoft Research now available in the cloud

    The Microsoft Research Outreach team has worked extensively with the external research community to ...

  2. Team Foundation 中的错误和事件消息

    Visual Studio Team System Team Foundation 中的错误和事件消息 Team Foundation 通过显示错误消息和事件消息来通知您操作成功以及操作失败.一部分错 ...

  3. Microsoft Dynamics CRM 分销行业解决方案

    Microsoft Dynamics CRM 分销行业解决方案 方案亮点 360度动态渠道信息管理 充分的客户细分 全面的业务代表考核指标 业务代表管理和能力建设 业务代表过程管理 业务代表费用管理 ...

  4. Azure China (4) 管理Azure China Storage Account

    <Windows Azure Platform 系列文章目录> Update 2015-05-10 强烈建议使用AzCopy工具,AzCopy命令行工具,是经过优化的.高性能Azure S ...

  5. Azure China (1) Azure公有云落地中国

    <Windows Azure Platform 系列文章目录> 微软公有云Microsoft Azure已经落地中国,官方网址:http://www.windowsazure.cn/. 在 ...

  6. Microsoft server software support for Microsoft Azure virtual machines

    http://support.microsoft.com/kb/2721672/en-us  Article ID: 2721672 - Last Review: November 22, 2014 ...

  7. 如何访问Microsoft Azure Storage

    首先先要创建存储账户 http://www.cnblogs.com/SignalTips/p/4119128.html 可以通过以下的几个方式访问 通过Visual Studio 2013 Commu ...

  8. Microsoft TFS 如何显示在Windows 的上下文菜单中

    How to showing in Windows Explorer context for TFS I am not sure if this would help or you are willi ...

  9. Microsoft .NET Pet Shop 简介

    最初研究 .NET Pet Shop 的目的是用 Microsoft .NET 实现 Sun 主要的 J2EE 蓝图应用程序 Sun Java Pet Store 同样的应用程序功能. 根据用 .NE ...

随机推荐

  1. 服务端跨域处理 Cors

    1  添加 System.Web.Cors,System.Web.Http.Cors 2 global文件中 注册asp.net 管道事件 protected void Application_Beg ...

  2. Linux 安装MySQL

    安装配置 [root@iZ28gvqe4biZ ~]# rpm -Uvh http://dev.mysql.com/get/mysql-community-release-el7-5.noarch.r ...

  3. linux-crontab定时任务

    crontab命令常见于Unix和Linux的操作系统之中,用于设置周期性被执行的指令.该命令从标准输入设备读取指令,并将其存放于"crontab"文件中,以供之后读取和执行.通常 ...

  4. Scala中apply的用法

    Scala中的 apply 方法有着不同的含义, 对于函数来说该方法意味着调用function本身, 以下说明摘自Programming in Scala, 3rd Edition Every fun ...

  5. angularjs $emit $on $broadcast 父子 兄弟之间传值

    父子之间 <div ng-controller="ParentCtrl"> <div ng-controller="ChildCtrl"> ...

  6. JAVA与数据库MySQL相连接

    JDBC(Java数据库连接体系结构): 是Java实现数据库访问的应用程序编程接口,主要功能是管理存放在数据库中的数据.通过接口对象,应用程序可以完成与数据库的连接,执行SQL语句,从数据库中获取结 ...

  7. 简单的maven配置

    groupId是指com.xx 组织标识 artifactId才是项目名称 2)编译源代码 mvn compile 3)编译测试代码 mvn test-compile 4)清空 mvn clean 5 ...

  8. python学习笔记(二)

    (一)模块打包     --->        注:suba和subb文件夹下的__init__.py文件,即使为空,也必须存在 "setup.py" from distut ...

  9. 这两天遇到iphone使用app store下载免费软件,必须验证付款信息才能购物是怎么回事???

    答案: 在你这台设备上再设置一下,具体方法是:1.点设置进入2.点iTunes Store 和App Store 3.点 Apple ID ,如果没设置,设置一下,如果有的,再点击 4.出现一上选择的 ...

  10. webapi8