阿里电话面试问题----100万个URL怎样找到出现频率最高的前100个？

内推阿里电话面试中面试官给我出的一个题：

我想到的第一个解决方案，就是把URL放到STL的map里面，以出现的频率（pair的第二个字段）进行排序，之后按照排序结果返回：

下面口说无凭，show your code。当然，在讨论帖子中遭遇了工程界大牛的SQL代码在技术上的碾压：什么是做工程的，什么是工程师的思维——不要一味地埋头搞算法。

讨论帖：

http://bbs.csdn.net/topics/391080906

python 抓取百度搜索结果的讨论贴：

http://bbs.csdn.net/topics/391077668

实验数据。python从百度抓得：

# -*- coding: utf-8 -*-

"""

Spyder Editor

This is a temporary script file.

"""

import urllib2

import re

import os

# Connect to a URL.
# One page of search results contains roughly 200 URLs.
# The output file is opened in append mode ('ab+'), so every run adds to url.txt.
file_url = open('url.txt', 'ab+')

# The text typed into the search box; it can be set to a number so that each
# search returns different results.
search = '123'

# The literal must stay on a single line: a line break inside the quotes is a
# syntax error and would also split the query string.
url = "http://www.baidu.com/s?wd=" + search

def setUrlToFile():
    """Fetch the page at ``url`` and append the links found in it to ``file_url``.

    Every double-quoted http/https/ftp/ftps link in the HTML is printed; links
    shorter than 256 characters are also written to the file, one per line.
    Relies on the module-level ``url`` and ``file_url`` objects.
    """
    website = urllib2.urlopen(url)
    try:
        # Read the raw HTML of the page.
        html = website.read()
    finally:
        # Release the connection even when reading fails.
        website.close()

    # re.findall returns (full_link, scheme) tuples because the pattern has two
    # groups. The pattern must be one unbroken literal or the script cannot parse.
    links = re.findall('"((http|ftp)s?://.*?)"', html)
    for s in links:
        print(s[0])
        # Only links shorter than 256 characters are stored.
        if len(s[0]) < 256:
            file_url.write(s[0] + '\r\n')

# Collect the experimental data: fetch the result page 50 times.
for i in range(0, 50):
    setUrlToFile()
file_url.close()

# The file has to be reopened (read-only) before its lines can be counted.
file_url = open('url.txt', 'r')
try:
    # Count lazily instead of materialising every line with readlines().
    file_lines = sum(1 for _ in file_url)
    # Report the file name, not the repr of the file object.
    print("there are %d url in %s" % (file_lines, file_url.name))
finally:
    file_url.close()

方法1：

c++ 写的读 url.txt放到map里面

对map<string , int>的value进行排序，得到前100个

执行一下也就55s，还是非常快的。url长度进行了限制小于256个字符

#pragma once

/*

//计算代码段执行时间的类

//

*/

#include <iostream>

#ifndef ComputeTime_h

#define ComputeTime_h

//单位毫秒

// Stopwatch for timing a section of code; End() reports milliseconds.
class   ComputeTime
{
public:
	ComputeTime();
	virtual ~ComputeTime();

	// Records the starting counter value; false if it could not be recorded.
	bool Begin();
	// Milliseconds elapsed since Begin(), or 0 when the timer is not initialized.
	double End();
	// True when the timer was initialized successfully.
	bool Avaliable();

private:
	int Initialized;     // result of the frequency query made by the constructor
	__int64 Frequency;   // counter frequency filled in by the constructor
	__int64 BeginTime;   // counter value captured by Begin()
};

#endif

#include "stdafx.h"

#include "ComputeTime.h"

#include <iostream>

#include <Windows.h>

// Queries the performance-counter frequency once; Initialized keeps the result
// of that query and gates Begin()/End().
ComputeTime::ComputeTime()
{
	// Start from a defined state so that End() without a preceding Begin(),
	// or a failed frequency query, never reads indeterminate members.
	Frequency = 0;
	BeginTime = 0;
	Initialized = QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER *>(&Frequency));
}

 // Nothing to release: the timer owns no resources.
 ComputeTime::~ComputeTime()
 {
 }

 bool   ComputeTime::Begin()

{

	if(!Initialized)

		return 0;

	 return   QueryPerformanceCounter((LARGE_INTEGER   *)&BeginTime);

 }

 double   ComputeTime::End()

{

	 if(!Initialized)

		return 0;

	 __int64   endtime;  

	 QueryPerformanceCounter((LARGE_INTEGER   *)&endtime);  

	 __int64   elapsed = endtime-BeginTime;  

	 return   ((double)elapsed/(double)Frequency)*1000.0;  //单位毫秒

 }  

 // Reports whether the constructor's frequency query succeeded.
 bool   ComputeTime::Avaliable()
 {
 	return Initialized != 0;
 }

// sortUrl.cpp : 定义控制台应用程序的入口点。

//

#include "stdafx.h"

//#include <utility>

#include <vector>

#include <map>

#include <fstream>

#include <iostream>

#include <string>

#include <algorithm>

#include "ComputeTime.h"

using namespace std;

// Global occurrence table: URL text -> number of times it has been inserted.
std::map<std::string,int> urlfrequency;

// One (URL, count) entry of the table.
typedef std::pair<std::string, int> PAIR;

// Orders PAIR entries by descending count, so sorting puts the most frequent first.
struct CmpByValue
{
	// const-qualified so the comparator can be called through a const object
	// or reference as well.
	bool operator()(const PAIR& lhs, const PAIR& rhs) const
	{
		return lhs.second > rhs.second;
	}
};

// Prints the most frequent URLs of the table, highest count first.
// At most 100 entries are printed; each entry takes two lines (URL, then count).
// The table is taken by const reference to avoid copying it, and tables with
// fewer than 100 distinct URLs are handled instead of indexing out of range.
void find_largeTH(const std::map<std::string,int>& urlfrequency)
{
	typedef std::pair<std::string, int> Entry;
	const std::size_t kTop = 100;

	// Copy the map entries into a vector so they can be ordered by value.
	std::vector<Entry> url_quency_vec(urlfrequency.begin(), urlfrequency.end());
	const std::size_t shown = std::min(kTop, url_quency_vec.size());

	// Only the first `shown` positions have to end up in order.
	std::partial_sort(url_quency_vec.begin(), url_quency_vec.begin() + shown, url_quency_vec.end(),
		[](const Entry& lhs, const Entry& rhs) { return lhs.second > rhs.second; });

	for (std::size_t i = 0; i != shown; ++i)
	{
		std::cout << url_quency_vec[i].first << '\n';
		std::cout << url_quency_vec[i].second << '\n';
	}
	std::cout.flush();
}

// Records one occurrence of `url` in the global urlfrequency table:
// a URL seen for the first time ends up with count 1, a known one is incremented.
void insertUrl(string url)
{
	// operator[] value-initializes a missing entry to 0 before the increment.
	++urlfrequency[url];
}

// Program entry point: counts every URL line of url.txt, prints the most
// frequent ones and reports the elapsed time in milliseconds.
int _tmain(int argc, _TCHAR* argv[])
{
	// Read-only access is enough. Binary mode is kept, so the '\r' of a
	// "\r\n" line ending is still in the data and is stripped below.
	std::ifstream URLfile("url.txt", std::ios::in | std::ios::binary);
	if (!URLfile.is_open())
	{
		std::cout << "Error opening file";
		return 1;
	}
	std::cout << "open file success!" << std::endl;

	ComputeTime cp;
	cp.Begin();

	// The read itself is the loop condition: testing eof() before reading
	// processes one bogus empty record after the last line, and a fixed-size
	// char buffer sets failbit on an over-long line so eof() is never reached.
	std::string line;
	while (std::getline(URLfile, line))
	{
		if (!line.empty() && line[line.size() - 1] == '\r')
			line.erase(line.size() - 1);
		if (line.empty())
			continue;	// blank lines are not URLs
		insertUrl(line);
	}

	find_largeTH(urlfrequency);
	std::cout << "running time: " << cp.End() << "ms" << std::endl;

	// Wait for a key press before the program exits.
	getchar();
	//system("pause");
	return 0;
}

实验结果：55s还不算太差。能够接受，毕竟是头脑中的第一个解决方式。

方法2：

hash code 版本，只是不知道怎么把 hash 和 url 关联起来：

// urlFind.cpp : 定义控制台应用程序的入口点。

//

// sortUrl.cpp : 定义控制台应用程序的入口点。

//

#include "stdafx.h"

#include <vector>

#include <map>

#include <fstream>

#include <iostream>

#include <string>

#include <algorithm>

#include <unordered_map>

#include "ComputeTime.h"

using namespace std;

// Global occurrence table keyed by the hash of a URL; the URL text is not stored.
std::map<unsigned int,int> urlhash;

// One (hash, count) entry of urlhash.
typedef std::pair<unsigned int, int> PAIR;

// A URL together with its count. operator< is inverted on purpose:
// an entry compares "less" when its count is larger.
struct info{
	std::string url;
	int cnt;
	bool operator<(const info &r) const {
		return cnt>r.cnt;
	}
};

// Not referenced by the code in this listing.
// NOTE(review): with `using namespace std;` an unqualified use of `count`
// would be ambiguous with std::count; refer to it as ::count.
std::unordered_map<std::string,int> count;
//priority_queue<info> pq;

// Orders PAIR entries by descending count, so sorting puts the most frequent first.
struct CmpByValue
{
	// const-qualified so the comparator can be called through a const object
	// or reference as well.
	bool operator()(const PAIR& lhs, const PAIR& rhs) const
	{
		return lhs.second > rhs.second;
	}
};

// Prints the most frequent hash values of the table, highest count first.
// At most 100 entries are printed; each entry takes two lines (hash, then count).
// The table is taken by const reference to avoid copying it, and tables with
// fewer than 100 entries are handled instead of indexing out of range.
void find_largeTH(const std::map<unsigned int,int>& urlhash)
{
	typedef std::pair<unsigned int, int> Entry;
	const std::size_t kTop = 100;

	// Copy the map entries into a vector so they can be ordered by value.
	std::vector<Entry> url_quency_vec(urlhash.begin(), urlhash.end());
	const std::size_t shown = std::min(kTop, url_quency_vec.size());

	// Only the first `shown` positions have to end up in order.
	std::partial_sort(url_quency_vec.begin(), url_quency_vec.begin() + shown, url_quency_vec.end(),
		[](const Entry& lhs, const Entry& rhs) { return lhs.second > rhs.second; });

	for (std::size_t i = 0; i != shown; ++i)
	{
		std::cout << url_quency_vec[i].first << '\n';
		std::cout << url_quency_vec[i].second << '\n';
	}
	std::cout.flush();
}

// BKDR Hash Function
// Folds every character of the NUL-terminated string into a running value
// (hash = hash * seed + character) and returns the low 31 bits of the result.
// The string is only read, so it is accepted as const char*; a null pointer
// hashes to 0, the same as the empty string.
unsigned int BKDRHash(const char *str)
{
	const unsigned int seed = 131; // 31 131 1313 13131 131313 etc..
	unsigned int hash = 0;

	if (str == 0)
		return hash;

	for (; *str != '\0'; ++str)
	{
		hash = hash * seed + (*str);
	}

	// Clear the top bit so the result fits in 31 bits.
	return (hash & 0x7FFFFFFF);
}

// Counts one occurrence of `url` under its BKDR hash in the global urlhash table.
// Only the hash is used as the key, so URLs whose hashes are equal share a counter.
void insertUrl(string url)
{
	const unsigned int key = BKDRHash(const_cast<char *>(url.c_str()));
	// operator[] value-initializes a missing entry to 0 before the increment.
	++urlhash[key];
}

// Program entry point: hashes and counts every URL line of url.txt, prints the
// most frequent hash values and reports the elapsed time in milliseconds.
int _tmain(int argc, _TCHAR* argv[])
{
	// Read-only access is enough. Binary mode is kept, so the '\r' of a
	// "\r\n" line ending is still in the data and is stripped below.
	std::ifstream URLfile("url.txt", std::ios::in | std::ios::binary);
	if (!URLfile.is_open())
	{
		std::cout << "Error opening file";
		return 1;
	}
	std::cout << "open file success!" << std::endl;

	ComputeTime cp;
	cp.Begin();

	// The read itself is the loop condition: testing eof() before reading
	// processes one bogus empty record after the last line, and a fixed-size
	// char buffer sets failbit on an over-long line so eof() is never reached.
	std::string line;
	while (std::getline(URLfile, line))
	{
		if (!line.empty() && line[line.size() - 1] == '\r')
			line.erase(line.size() - 1);
		if (line.empty())
			continue;	// blank lines are not URLs
		insertUrl(line);
	}

	find_largeTH(urlhash);
	std::cout << "running time: " << cp.End() << "ms" << std::endl;

	// Wait for a key press before the program exits.
	getchar();
	//system("pause");
	return 0;
}

性能15秒左右：缺点在于没有把hashcode和url进行关联，技术的处理速度已经很可观了

（此处原有一张配图，图片链接在转载时已损坏。）

方法3：

下面用STL的hash容器unordered_map和优先队列（就是堆）来实现这个问题。

// urlFind.cpp : 定义控制台应用程序的入口点。

//

// sortUrl.cpp : 定义控制台应用程序的入口点。

//

#include "stdafx.h"

#include <vector>

#include <map>

#include <fstream>

#include <iostream>

#include <string>

#include <algorithm>

#include <unordered_map>

#include <queue>

#include "ComputeTime.h"

using namespace std;

// One (URL, count) entry.
using PAIR = pair<string, int>;

// A URL with its occurrence count; ordered by count so that the default
// priority_queue keeps the entry with the largest count on top.
struct info
{
	string url;
	int cnt;

	bool operator<(const info &r) const
	{
		return r.cnt > cnt;
	}
};

// Global occurrence table: URL text -> number of times it has been inserted.
unordered_map<string,int> hash_url;

// Global heap of entries, largest count first.
priority_queue<info> pq;

// Prints the most frequent URLs of the table, highest count first.
// At most 100 entries are printed; each entry takes two lines (URL, then count).
// The table is taken by const reference to avoid copying it. A local heap
// bounded to 100 candidates is used, so tables with fewer than 100 URLs are
// safe and no leftover entries stay behind in shared state between calls.
void find_largeTH(const std::unordered_map<std::string,int>& urlhash)
{
	typedef std::pair<std::string, int> Entry;
	const std::size_t kTop = 100;

	// Min-heap on the count: top() is the weakest of the current candidates.
	auto higherCount = [](const Entry& lhs, const Entry& rhs) { return lhs.second > rhs.second; };
	std::priority_queue<Entry, std::vector<Entry>, decltype(higherCount)> best(higherCount);

	for (const auto& entry : urlhash)
	{
		if (best.size() < kTop)
		{
			best.push(entry);
		}
		else if (entry.second > best.top().second)
		{
			// The newcomer beats the weakest candidate: replace it.
			best.pop();
			best.push(entry);
		}
	}

	// Draining the min-heap yields ascending counts; print them in reverse.
	std::vector<Entry> ranked;
	ranked.reserve(best.size());
	while (!best.empty())
	{
		ranked.push_back(best.top());
		best.pop();
	}
	for (std::vector<Entry>::const_reverse_iterator it = ranked.rbegin(); it != ranked.rend(); ++it)
	{
		std::cout << it->first << '\n';
		std::cout << it->second << '\n';
	}
	std::cout.flush();
}

// Records one occurrence of `url` in the global hash_url table:
// a URL seen for the first time ends up with count 1, a known one is incremented.
void insertUrl(string url)
{
	// operator[] value-initializes a missing entry to 0 before the increment.
	++hash_url[url];
}

// Program entry point: counts every URL line of url.txt, prints the most
// frequent ones and reports the elapsed time in milliseconds.
int _tmain(int argc, _TCHAR* argv[])
{
	// Read-only access is enough. Binary mode is kept, so the '\r' of a
	// "\r\n" line ending is still in the data and is stripped below.
	std::ifstream URLfile("url.txt", std::ios::in | std::ios::binary);
	if (!URLfile.is_open())
	{
		std::cout << "Error opening file";
		return 1;
	}
	std::cout << "open file success!" << std::endl;

	ComputeTime cp;
	cp.Begin();

	// The read itself is the loop condition: testing eof() before reading
	// processes one bogus empty record after the last line, and a fixed-size
	// char buffer sets failbit on an over-long line so eof() is never reached.
	std::string line;
	while (std::getline(URLfile, line))
	{
		if (!line.empty() && line[line.size() - 1] == '\r')
			line.erase(line.size() - 1);
		if (line.empty())
			continue;	// blank lines are not URLs
		insertUrl(line);
	}

	find_largeTH(hash_url);
	std::cout << "running time: " << cp.End() << "ms" << std::endl;

	// Wait for a key press before the program exits.
	getchar();
	//system("pause");
	return 0;
}

基本上算是算法里面比较优秀的解决方案了，面试官如果能听到这个方法应该会比较欣喜。

方法4：实验耗时未知，技术上碾压了上述解决方案。所以年轻人，不要重复造轮子，哈哈。

数据库，SQL语句：

-- Load the file into the url column of tb_url.
LOAD DATA INFILE "d:/bigdata.txt" INTO TABLE tb_url (url);

-- Count the rows of each distinct url and keep the 100 largest counts.
SELECT url, COUNT(url) AS show_count
FROM tb_url
GROUP BY url
ORDER BY show_count DESC
LIMIT 100

阿里电话面试问题----100万个URL怎样找到出现频率最高的前100个？的更多相关文章

阿里电话面试问题----100万个URL如何找到出现频率最高的前100个？
内推阿里电话面试中面试官给我出的一个题: 我想的头一个解决方案,就是放到stl 的map里面对出现的频率作为pair的第二个字段进行排序,之后按照排序结果返回: 下面口说无凭,show your co ...
SQL 从100万条记录中的到成绩最高的记录
从100万条记录中的到成绩最高的记录问题分析:要从一张表中找到成绩最高的记录并不难,有很多种办法,最简单的就是利用TOP 1 select top 1 * from student order b ...
阿里Java开发电话面试经历--惨败
近期准备跳槽,想试试知名大企业--阿里.经过boss直聘上一些内部人员的内推,有幸获得了一次电话面试的机会.(虽然在面试开始之前就大概知道结果是如何,但是也总得试试自己个有多水,哈哈哈...) 跟大家 ...
阿里P8面试官：如何设计一个扛住千万级并发的架构？
大家先思考一个问题,这也是在面试过程中经常遇到的问题. 如果你们公司现在的产品能够支持10W用户访问,你们老板突然和你说,融到钱了,会大量投放广告,预计在1个月后用户量会达到1000W,如果这个任务交 ...
Java 最常用类（前100名）来自一万个开源项目
大部分的 Java 软件开发都会使用到各种不同的库.近日我们从一万个开源的 Java 项目中进行分析,从中提取出最常用的 Java 类,这些类有来自于 Java 的标准库,也有第三方库.每个类在同一个 ...
上海支付宝终面后等了两周,没能收到offer却来了杭州淘宝的电话面试
上上周一(14/12/22)上海支付宝hr终面 http://www.cnblogs.com/zhanghaoh/p/4178386.html 苦苦等了两周,没能如愿收到offer,却在今天等来了杭 ...
电话面试问答Top 50 --[伯乐在线]
今年是2015年,在过去几年中,电面(电话面试)是筛选程序员职位候选人的最流行的方式.它让雇佣双方很容易互相了解对方,候选人不需要去未来雇主的所在地,面试官也不用做额外的安排.这是我介绍程序员面试问题 ...
100万套PPT模板，包含全宇宙所有主题类型PPT，绕宇宙100圈，持续更新
100万套PPT模板,包含全宇宙所有主题类型PPT(全部免费,都是精品,没有一张垃圾不好看的PPT,任何一张PPT拿来套入自己的信息就可以立马使用),绕宇宙100圈,任意一个模板在某文库上都价不菲.强 ...
如何通过Dataphin构建数据中台新增100万用户？
欢迎来到数据中台小讲堂!这一期我们来看看,作为阿里巴巴数据中台(OneData - OneModel.OneID.OneService)方法论的产品载体,Dataphin如何帮助传统零售企业实现数字化 ...

随机推荐

How To:利用frm和idb文件进行数据恢复.txt
在另外一个机器上准备测试数据,并传输到dbadb05机器的/mysql/backup/reco/位置下.开始尝试恢复数据一.使用mysqlfrm获取表结构信息及DDL语句. [mysql@dbadb0 ...
微信小程序UI组件库 iView Weapp快速上手
概述今天在网上突然看到iView新出了一个微信小程序的组件库iView Weapp,自己就上手试了一下,发现用起来还是不错的,把自己使用的过程与大家分享下. 一预览iView组件 1.可以在手机上 ...
笔试算法题（08）：输出倒数第K个节点
出题:输入一个单向链表,要求输出链表中倒数第K个节点分析:利用等差指针,指针A先行K步,然后指针B从链表头与A同步前进,当A到达链表尾时B指向的节点就是倒数第K个节点: 解题: struct Nod ...
mysql批量插值
将查询结果集插入到表中(适用批量插值) 将结果集插入不需要添加VALUES INSERT INTO `erp`.`role_menu` (`ROLEUUID`, `MENUUUID`) (SELEC ...
如何系统学习并且掌握JavaScript
django+uwsgi+nginx部署(非常详细)
django+uwsgi+nginx部署 1.介绍: 在网上看了很多教程,但自己部署了很久都没有成功,这篇博文记录自己所踩过得坑. 2.环境: Ubuntu 16.04.1 LTS (GNU/Linu ...
【BZOJ 2118】墨墨的等式（Dijkstra）
BZOJ2118 墨墨的等式题链:http://www.lydsy.com/JudgeOnline/problem.php?id=2118 Description 墨墨突然对等式很感兴趣,他正在研究 ...
19-看图理解数据结构与算法系列(Radix树)
Radix树 Radix树,即基数树,也称压缩前缀树,是一种提供key-value存储查找的数据结构.与Trie不同的是,它对Trie树进行了空间优化,只有一个子节点的中间节点将被压缩.同样的,Rad ...
Qt 安装与配置记录
一安装的时候得选一个Qt安装啊!!不要忘了展开这一项,而只安装Qt creator 展开之后会发现有很多版本,为了方便,选自带编译器mingw,就不需要麻烦的配置了二打开Qt creator 后 ...
[NOIP2004] 提高组洛谷P1091 合唱队形
题目描述 N位同学站成一排,音乐老师要请其中的(N-K)位同学出列,使得剩下的K位同学排成合唱队形. 合唱队形是指这样的一种队形:设K位同学从左到右依次编号为1,2…,K,他们的身高分别为T1,T2, ...

阿里电话面试问题----100万个URL怎样找到出现频率最高的前100个？

阿里电话面试问题----100万个URL怎样找到出现频率最高的前100个？的更多相关文章

随机推荐

热门专题