项目二:

AEther:

项目 一、项目需求:对搜索关键词进行类别的统计分析,为了后面的entity-rank做准备。

0,各种关键数据统计:

数据量:1 个月的数据,约 1000 TB。

1,对IE的所有浏览搜索的提取代码:

Scope:

//Script GUID:ad2766d3-7aec-4ffa-9bbd-ec2740361999
//Used for tracking history
// Purpose: for a single day of CompetitiveUnifiedPageView data, count how often each
// (originating query, normalized request URL) pair occurs and write the pairs to a .wsv file.
REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";
// Start and End are the same date, so exactly one day is read; Source is "All"
// (the "DesktopIE"-only variant is left commented out below).
UnifiedViewRaw =
    VIEW "/shares/searchDM/distrib/released/CompetitiveUnifiedView/CompetitiveUnifiedPageView.view"
    PARAMS
    (
        Start = @"2016-07-26",
        End = @"2016-07-26",
        Source = @"All"
//      Source = @"DesktopIE"
    );
// Keeps page views that were reached from a query page in the "web" vertical of the
// zh-cn market and that are not themselves query pages; the aggregate is COUNT() per
// (Query, NormalizedUrl).
// NOTE(review): the numeric threshold after "Count >=" in the HAVING clause is missing
// in this copy of the script; restore it before running.
ClickData =
    SELECT Page_FromPage.Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Request_Url)) AS NormalizedUrl,
           COUNT() AS Count
    FROM UnifiedViewRaw
    WHERE Page_FromPage.IsQuery == true AND Page_FromPage.Vertical.ToLower() == "web" AND NOT Request_IsQuery AND Page_FromPage.Market.ToLower() == "zh-cn"
    HAVING Count >=  AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;

    // Page_FromPage.IsQuery: True if the page is a query page
    // Vertical: Search Vertical of this PageView
    // Request_IsQuery bool: True if this page view is search engine result page

// No rowset is named here, so this presumably outputs the last rowset (ClickData) -- confirm.
OUTPUT
TO @"/my/entityranker/oneDay/IEqueryclickurlpairsAll.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

/// <summary>
/// URL helpers that reduce a URL to a lower-case, scheme-less form.
/// </summary>
public class CMyUtils
{
    /// <summary>
    /// Lower-cases <paramref name="url"/> and removes a leading "http://" or "https://",
    /// a leading "www." and one trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>The normalized URL, or an empty string when <paramref name="url"/> is null or empty.</returns>
    public static string NormalizeURL(string url)
    {
        // Guard: a missing URL previously caused a NullReferenceException.
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // Invariant lower-casing and ordinal matching keep the result independent of the
        // machine's current culture (e.g. the Turkish dotless-i casing rules).
        url = url.ToLowerInvariant();
        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }
        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }
        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            // Drop exactly one trailing slash.
            url = url.Substring(0, url.Length - 1);
        }
        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to extract the host from; may be null or empty.</param>
    /// <returns>The normalized URL truncated at its first "/", or the whole normalized URL when it has none.</returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');
        // NOTE(review): the original condition was truncated in this copy of the file;
        // "> 0" is a reconstruction (a leading "/" leaves the value untouched) -- confirm.
        if (slashPosition > 0)
        {
            url = url.Substring(0, slashPosition);
        }
        return url;
    }
}

// Reducer whose output schema equals its input schema and which copies selected input
// rows to the output unchanged.
// NOTE(review): two statements inside Reduce lost their leading tokens in this copy of
// the file (see the notes below), so the class does not compile as shown.
public class TopReducer : Reducer
{
    // The output schema is a clone of the input schema.
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return input.Clone();
    }

    // Walks the rows handed to the reducer and yields a copy of each row that passes
    // the (truncated) condition below. args is not read by the visible code.
    public override IEnumerable<Row> Reduce(RowSet input, Row output, string[] args)
    {
        // NOTE(review): truncated statement -- presumably a counter declaration such as
        // "int count = 0"; confirm against the original script.
        ;
        foreach (Row row in input.Rows)
        {
            // NOTE(review): truncated "if (...)" header -- presumably limits how many rows
            // are emitted; the limit value cannot be recovered from this copy.
            )
            {
                row.Copy(output);
                yield return output;
            }
        }
    }
}

2,对bing的所有浏览搜索的提取代码:

Scope:

//Script GUID:8e9ba2e3-8288-49e1-a5ff-7776d653ae16
//Used for tracking history
// Purpose: for a single day of SLAPI page views, word-break the raw query, expand each
// page view into one row per web result (via FEXLogSimpleExtractor) and count
// (Query, normalized URL) pairs, writing them to a .wsv file.
REFERENCE "/local/IndexQualityCJK/wb/WordBreaker.dll";
RESOURCE "/local/IndexQualityCJK/wb/unzip.exe";
RESOURCE "/local/IndexQualityCJK/wb/wordbreak.zip";

REFERENCE @"/shares/searchDM/distrib/released/SLAPI/SearchLogApi.dll";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Serializer.exe";
REFERENCE @"/shares/searchDM/distrib/released/SLAPI/Microsoft.Live.Json.dll";

REFERENCE "/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE "/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING MS.Internal.Bing.DataMining.SearchLogApi;
USING Microsoft.Live.Json;

// Start and End are the same date, so one day is read; sampling is off and the
// dataset parameter is "Mobile".
SlapiPageView =
      VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogPageView.view"
       //VIEW "/shares/searchDM/distrib/released/SLAPI/SearchLogSessionView.view"
       //PARAMS (Start = @"2013-09-01", End = @"2013-09-01", UseSample=false, Dataset="Mobile");
       PARAMS (Start = "2016-07-26", End = "2016-07-26", UseSample=false, Dataset= "Mobile");

// Keeps page views flagged neither as bot nor as marketing traffic, in the zh-cn
// market and the "web" vertical.
ZHCNTraffic =
    SELECT WordBreaker.BreakWords(Query_RawQuery, "zh-cn") AS Query,
           Request_RequestTime.ToString("yyyy-MM-dd") AS QDate,
           Page_Entities_WebResults
    FROM SlapiPageView
    WHERE Request_IsBotVNext == false AND Request_IsMarketingTraffic == false
          AND string.IsNullOrEmpty(Market) == false AND Market.ToLower() == "zh-cn"
          AND string.IsNullOrEmpty(Vertical) == false AND Vertical.ToLower() == "web";
//AND string.IsNullOrEmpty(FormCode) == false AND (FormCode.ToUpper() == "QBLH" OR FormCode.ToUpper() == "QBRE" OR FormCode.ToUpper() == "MSNBHP" OR FormCode.ToUpper() == "MSNFLH" OR FormCode.ToUpper() == "BCNASI");

// FEXLogSimpleExtractor is the Processor defined in the C# code-behind of this script.
ProcessWebEntity =
    PROCESS ZHCNTraffic
    USING FEXLogSimpleExtractor;

// NOTE(review): the numeric constants after "Click >" and "PairCount >=" are missing
// in this copy of the script; restore them before running.
ClickQueryUrlPairs =
    SELECT Query,
           RetroIndex.UrlNormalizer.GetNormalizedStringNoThrow(Encoding.UTF8.GetBytes(Url)) AS NormalizedUrl,
           COUNT() AS PairCount
    FROM ProcessWebEntity
    WHERE Click >  HAVING PairCount >=  AND Query != "" AND NormalizedUrl != "" AND NormalizedUrl != null;

// No rowset is named here, so this presumably outputs the last rowset
// (ClickQueryUrlPairs) -- confirm.
OUTPUT
TO @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;
using MS.Internal.Bing.DataMining.SearchLogApi;

/// <summary>
/// URL helpers that reduce a URL to a lower-case, scheme-less form.
/// </summary>
public class URLUtility
{
    /// <summary>
    /// Lower-cases <paramref name="url"/> and removes a leading "http://" or "https://",
    /// a leading "www." and one trailing "/".
    /// </summary>
    /// <param name="url">The URL to normalize; may be null or empty.</param>
    /// <returns>The normalized URL, or an empty string when <paramref name="url"/> is null or empty.</returns>
    public static string NormalizeURL(string url)
    {
        // Guard: a result without a title URL previously caused a NullReferenceException.
        if (string.IsNullOrEmpty(url))
        {
            return string.Empty;
        }

        // Invariant lower-casing and ordinal matching keep the result independent of the
        // machine's current culture (e.g. the Turkish dotless-i casing rules).
        url = url.ToLowerInvariant();
        if (url.StartsWith("http://", StringComparison.Ordinal))
        {
            url = url.Substring("http://".Length);
        }
        else if (url.StartsWith("https://", StringComparison.Ordinal))
        {
            url = url.Substring("https://".Length);
        }
        if (url.StartsWith("www.", StringComparison.Ordinal))
        {
            url = url.Substring("www.".Length);
        }
        if (url.EndsWith("/", StringComparison.Ordinal))
        {
            // Drop exactly one trailing slash.
            url = url.Substring(0, url.Length - 1);
        }
        return url;
    }

    /// <summary>
    /// Returns the part of the normalized URL that precedes the first "/".
    /// </summary>
    /// <param name="url">The URL to extract the host from; may be null or empty.</param>
    /// <returns>The normalized URL truncated at its first "/", or the whole normalized URL when it has none.</returns>
    public static string GetHost(string url)
    {
        url = NormalizeURL(url);
        int slashPosition = url.IndexOf('/');
        // NOTE(review): the original condition was truncated in this copy of the file;
        // "> 0" is a reconstruction (a leading "/" leaves the value untouched) -- confirm.
        if (slashPosition > 0)
        {
            url = url.Substring(0, slashPosition);
        }
        return url;
    }
}

// Processor that expands each input page view into rows describing its web results:
// the query, the query date, the result's title URL and host, its position and its
// click count.
// NOTE(review): the loop header and the inner condition in Process lost their leading
// tokens in this copy of the file (see the notes below), so the class does not compile
// as shown.
public class FEXLogSimpleExtractor : Processor
{
    // Fixed output schema; note the date column is named "QueryDate" on output although
    // it is read from the input column "QDate".
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Query:string,QueryDate:string,Url:string,Host:string,POS:int,Click:int");
    }

    // Reads Query, QDate and Page_Entities_WebResults from every input row and yields
    // output rows built from the entries of the web-result list. The same output Row
    // instance is refilled and yielded each time. args is not read.
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string Query = row["Query"].String;
            string QueryDate = row["QDate"].String;
            // NOTE(review): "as" yields null when the value is not a WebResultList; the
            // result is dereferenced below without a null check -- confirm this cannot happen.
            var WebEntities = row["Page_Entities_WebResults"].Value as MS.Internal.Bing.DataMining.SearchLogApi.WebResultList;

            // NOTE(review): truncated loop header -- presumably "for (int i = 0; ...".
            ; i < WebEntities.Count;i++ )
            {
                string Url = WebEntities[i].TitleUrl;
                string Host = URLUtility.GetHost(Url);
                int Pos = WebEntities[i].PositionOfEntityInTopLevelRegion;
                int Click = WebEntities[i].Clicks.Count;

                // NOTE(review): truncated "if (...)" header -- the filter condition cannot
                // be recovered from this copy; restore it from the original script.
                )
                {
                    output["Query"].Set(Query);
                    output["QueryDate"].Set(QueryDate);
                    output["Url"].Set(Url);
                    output["Host"].Set(Host);
                    output["POS"].Set(Pos);
                    output["Click"].Set(Click);

                    yield return output;
                }
            }
        }
    }
}

3,搜索查询和分类的提取代码:

//Script GUID:5b7abb8b-defd-4f2b-b703-9882ee6e960b
//Used for tracking history
// Purpose: read the latest RetroIndex snapshot, run RetroIndexProcessor over it to
// produce Url/Country/Language/Category, keep rows whose language is "zh_chs" and
// write them to a .wsv file.
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/RetroIndexProcessor.dll";
REFERENCE @"/shares/searchWebLoad/RetroIndex/bin/ManagedUrlNormalizer.dll";
RESOURCE @"/shares/searchWebLoad/RetroIndex/bin/NativeUrlNormalizer.dll";

USING RetroIndex;

// NOTE(review): the value of TierFlag is missing in this copy of the script; restore
// it before running.
Snapshot =
    VIEW "/shares/searchWebLoad/RetroIndex/Views/LatestSnapshot.view"
    PARAMS
    (
        Sample = false,
        TierFlag =
    );

// Unnamed rowset; the PROCESS statement below names no input, so it presumably
// consumes this rowset -- confirm.
SELECT Url,
       Header,
       Body,
       HttpHeader,
       CodePage
FROM Snapshot;

Uberchunk =
    PROCESS
    PRODUCE Url,
            Country,
            Language,
            Category
    USING RetroIndexProcessor
    HAVING string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs";

// No rowset is named here, so this presumably outputs the last rowset (Uberchunk) -- confirm.
OUTPUT
TO @"/local/IndexCJK/MobileTopSites/UrlCategory.wsv";

C#:

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using ScopeRuntime;

/// <summary>
/// Helpers that inspect a "#TAB#"-separated category string for CJK ("aa00"-prefixed)
/// mobile labels.
/// </summary>
public class Utility
{
    private static readonly string[] CategorySeparators = { "#TAB#" };

    private const string CjkPrefix = "aa00";

    // Suffixes that mark a category entry as carrying any mobile label.
    private static readonly string[] MobileSuffixes =
    {
        "Mobi",
        "CrossDevice",
        "MobileFriendly",
        "MobileUnFriendly"
    };

    // Suffixes that mark a category entry as mobile-unfriendly.
    private static readonly string[] UnFriendlySuffixes = { "MobileUnFriendly" };

    /// <summary>
    /// Tells whether any "#TAB#"-separated entry of <paramref name="category"/> starts with
    /// "aa00" and ends with "Mobi", "CrossDevice", "MobileFriendly" or "MobileUnFriendly".
    /// </summary>
    /// <param name="category">The category string; may be null or empty.</param>
    /// <returns>True when such an entry exists; false otherwise, including for null or empty input.</returns>
    public static bool CJKVersionMobileFriendly(string category)
    {
        return HasCjkEntryWithSuffix(category, MobileSuffixes);
    }

    /// <summary>
    /// Tells whether any "#TAB#"-separated entry of <paramref name="category"/> starts with
    /// "aa00" and ends with "MobileUnFriendly".
    /// </summary>
    /// <param name="category">The category string; may be null or empty.</param>
    /// <returns>True when such an entry exists; false otherwise, including for null or empty input.</returns>
    public static bool CJKVersionMobileUnFriendly(string category)
    {
        return HasCjkEntryWithSuffix(category, UnFriendlySuffixes);
    }

    // Shared scan: true when some entry starts with the CJK prefix and ends with one of
    // the given suffixes. Ordinal comparisons are used because the labels are machine
    // identifiers; culture-sensitive matching can treat "aa" specially in some cultures.
    private static bool HasCjkEntryWithSuffix(string category, string[] suffixes)
    {
        if (string.IsNullOrEmpty(category))
        {
            return false;
        }

        string[] entries = category.Split(CategorySeparators, StringSplitOptions.RemoveEmptyEntries);
        foreach (string entry in entries)
        {
            if (!entry.StartsWith(CjkPrefix, StringComparison.Ordinal))
            {
                continue;
            }

            foreach (string suffix in suffixes)
            {
                if (entry.EndsWith(suffix, StringComparison.Ordinal))
                {
                    return true;
                }
            }
        }

        return false;
    }
}

// Processor that emits (Url, MobileClassifier) for every input row that has a non-empty
// Url and whose Language is "zh_chs" (case-insensitive); other rows are skipped.
// NOTE(review): several statements in Process lost their leading tokens and numeric or
// string constants in this copy of the file, so the class does not compile as shown and
// the classifier values cannot be recovered from it.
public class CJKVersionMobileOkClassifierProcessor : Processor
{
    // Fixed two-column output schema.
    public override Schema Produces(string[] columns, string[] args, Schema input)
    {
        return new Schema("Url:string, MobileClassifier:int");
    }

    // Reads the feature columns listed below from each row and derives an integer
    // MobileClassifier from them. args is not read.
    public override IEnumerable<Row> Process(RowSet input, Row output, string[] args)
    {
        foreach (Row row in input.Rows)
        {
            string Url = row["Url"].String;
            string Language = row["Language"].String;

            // Skip rows without a Url or whose Language is not "zh_chs".
            if (!(string.IsNullOrEmpty(Url) == false && string.IsNullOrEmpty(Language) == false && Language.ToLower() == "zh_chs"))
            {
                continue;
            }

            //classifier features
            string Category = row["Category"].String;
            string DUPipeline_MobileUrls = row["DUPipeline_MobileUrls"].String;
            string DUPipeline_ResponsiveDesignSpans = row["DUPipeline_ResponsiveDesignSpans"].String;
            string DUV2_MobileUrl = row["DUV2_MobileUrl"].String;
            string InjHdr_MobileOkClassifier_v1 = row["InjHdr_MobileOkClassifier_v1"].String;
            string InjHdr_MobileOkX_v1 = row["InjHdr_MobileOkX_v1"].String;
            string InjHdr_MobileRedirect_V1 = row["InjHdr_MobileRedirect_V1"].String;
            string SpamJunkRuleID = row["SpamJunkRuleID"].String;

            // Prefer the v1 classifier header; fall back to the MobileOkX header when it is empty.
            string MobileOkClassifier = string.IsNullOrEmpty(InjHdr_MobileOkClassifier_v1) ? InjHdr_MobileOkX_v1 : InjHdr_MobileOkClassifier_v1;

            // NOTE(review): everything from here to the first output[...] call is damaged:
            //  - the lone ";" below is presumably the remains of the MobileClassifier declaration;
            //  - the last operand of the first condition and the headers of the two following
            //    conditions are truncated;
            //  - the three values assigned to MobileClassifier are missing.
            // SpamJunkRuleID is read above but no intact line uses it; presumably one of the
            // truncated conditions did. Restore all of this from the original script.
            ;
            if (string.IsNullOrEmpty(DUPipeline_MobileUrls) == false
                || string.IsNullOrEmpty(DUPipeline_ResponsiveDesignSpans) == false
                || string.IsNullOrEmpty(DUV2_MobileUrl) == false
                || string.IsNullOrEmpty(InjHdr_MobileRedirect_V1) == false
                || Utility.CJKVersionMobileFriendly(Category)
                || ("))
            {
                MobileClassifier = ;
            }
            "))
            {
                MobileClassifier = ;
            }
            ")
            {
                MobileClassifier = ;
            }

            output["Url"].Set(Url);
            output["MobileClassifier"].Set(MobileClassifier);
            yield return output;
        }
    }
}

4,对IE和bing进行union,然后对相同的query进行合并。

Scope

//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history
// Purpose: read the IE and Bing (Query, Url, Count) files, concatenate them and sum
// Count per (Query, Url), writing the result ordered by Query.

// NOTE(review): this reads "IEqueryclickurlpairs.wsv", whereas the step 1 script in this
// document writes "IEqueryclickurlpairsAll.wsv" -- confirm which file is intended.
ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();
// The step 2 script writes its columns as Query, NormalizedUrl, PairCount; they are
// read here as Query, Url, Count (presumably matched by position -- confirm).
bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

// UNION ALL keeps duplicates, so a pair present in both sources contributes both counts.
union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

// There is no explicit GROUP BY; presumably the non-aggregated columns (Query, Url)
// form the grouping key -- confirm against the SCOPE language rules.
result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";

5,得到Query, Category,ClickCount的对应。

//Script GUID:5a517444-605b-4f45-ae1b-4a95e6a5fc0a
//Used for tracking history
// NOTE(review): this script is identical (same GUID, same statements, same output path)
// to the step 4 union script earlier in this document. It does not join against any
// category data, so it cannot produce the Query/Category/ClickCount mapping that the
// step 5 heading describes -- the intended script appears to be missing.

ie =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/IEqueryclickurlpairs.wsv"
    USING DefaultTextExtractor();
bing =
    EXTRACT Query : string,
            Url : string,
            Count : int
    FROM @"/my/entityranker/oneDay/bingqueryclickurlpairsTime.wsv"
    USING DefaultTextExtractor();

// UNION ALL keeps duplicates, so a pair present in both sources contributes both counts.
union_all =
    SELECT *
    FROM ie
    UNION ALL
    SELECT *
    FROM bing;

// Sums Count per (Query, Url) and orders the result by Query.
result =
    SELECT Query,
           Url,
           SUM(Count) AS NewCount
    FROM union_all ORDER BY Query;
OUTPUT result
TO @"/my/entityranker/oneDay/UnionResultSorted.wsv";

6,得到了query->Category之后,要算哪个category出现得最多。

对每个category,把它每一次出现乘以对应的clickCount,然后累加起来。

这里用到了reduce来做。

7,算分数。

Intern---Microsoft Academic China Team的更多相关文章

  1. Announcing Microsoft Research Open Data – Datasets by Microsoft Research now available in the cloud

    The Microsoft Research Outreach team has worked extensively with the external research community to ...

  2. Team Foundation 中的错误和事件消息

    Visual Studio Team System Team Foundation 中的错误和事件消息 Team Foundation 通过显示错误消息和事件消息来通知您操作成功以及操作失败.一部分错 ...

  3. Microsoft Dynamics CRM 分销行业解决方案

    Microsoft Dynamics CRM 分销行业解决方案 方案亮点 360度动态渠道信息管理 充分的客户细分 全面的业务代表考核指标 业务代表管理和能力建设 业务代表过程管理 业务代表费用管理 ...

  4. Azure China (4) 管理Azure China Storage Account

    <Windows Azure Platform 系列文章目录> Update 2015-05-10 强烈建议使用AzCopy工具,AzCopy命令行工具,是经过优化的.高性能Azure S ...

  5. Azure China (1) Azure公有云落地中国

    <Windows Azure Platform 系列文章目录> 微软公有云Microsoft Azure已经落地中国,官方网址:http://www.windowsazure.cn/. 在 ...

  6. Microsoft server software support for Microsoft Azure virtual machines

    http://support.microsoft.com/kb/2721672/en-us  Article ID: 2721672 - Last Review: November 22, 2014 ...

  7. 如何访问Microsoft Azure Storage

    首先先要创建存储账户 http://www.cnblogs.com/SignalTips/p/4119128.html 可以通过以下的几个方式访问 通过Visual Studio 2013 Commu ...

  8. Microsoft TFS 如何显示在Windows 的上下文菜单中

    How to showing in Windows Explorer context for TFS I am not sure if this would help or you are willi ...

  9. Microsoft .NET Pet Shop 简介

    最初研究 .NET Pet Shop 的目的是用 Microsoft .NET 实现 Sun 主要的 J2EE 蓝图应用程序 Sun Java Pet Store 同样的应用程序功能. 根据用 .NE ...

随机推荐

  1. js获取可视区域高度

    document.body.clientWidth ==> BODY对象宽度 document.body.clientHeight ==> BODY对象高度 document.docume ...

  2. 使用 win+r 命令行打开我们的桌面应用(处女座的福音)

    首先新建一个文件夹,名为quickapp,然后在地址栏复制文件目录地址,进入系统高级设置,修改系统环境变量Path,双击后选择新建,输入quickapp文件目录地址,确认保存. 如何修改path变量? ...

  3. Java学习笔记(三)

    今天主要学习了ant ant概述 ant是一个将软件编译.测试.部署等步骤联系在一起加以自动化的一个工具,大多用于Java环境中的软件开发.在实际软件开发中,有很多地方可以用到ant 开发环境: Sy ...

  4. myeclipse配置maven

    1.首先配置好java的运行环境(JDK要1.7及以上版本),网上有详细资料. 2.下载maven,具体下载链接http://maven.apache.org/download.html 3.下载ap ...

  5. Mysql两个引擎对比

    Mysql两个引擎对比 MyIsam      优点:      1.支持B-Tree检索和文本全文检索      2.性能消耗方面相对较低      3.支持全表(table)锁      缺点: ...

  6. 测试或运维工作过程中最常用的几个linux命令?

     大家在测试工作过程中,可能会遇到需要你去服务器修改一些配置文件,譬如说某个字段的值是1 则关联老版本,是0则关联新版本,这时候你可能就需要会下vi的命令操作:或者查看session设置的时长,可能需 ...

  7. word中公式居中标号没有右对齐

    打开视图-标尺,调整右侧标尺就行了.

  8. JS控制,返回上一页之后强行刷新一次

    网站建设过程中,提交页面后我们经常要用到window.history.go(-1)返回上一页,因为页面的缓存功能,我们只能返回上次操作的页面,但在删除等操作中,我们希望实时看到删除项目后的页面,这就要 ...

  9. C#知识点记录

    用于记录C#知识要点. 参考:CLR via C#.C#并发编程.MSDN.百度 记录方式:读每本书,先看一遍,然后第二遍的时候,写笔记. CLR:公共语言运行时(Common Language Ru ...

  10. 安装第三方RPM仓库

    1.安装RepoForge源: CentOS 6.x [root@localhost /]# yum install http://pkgs.repoforge.org/rpmforge-releas ...