协同过滤源码路径:

~/project/javaproject/mahout-0.9/core/src $tree main/java/org/apache/mahout/cf/taste/ -L 2

main/java/org/apache/mahout/cf/taste/

├── common

│   ├── NoSuchItemException.java

│   ├── NoSuchUserException.java

│   ├── Refreshable.java

│   ├── TasteException.java

│   └── Weighting.java

├── eval

│   ├── DataModelBuilder.java

│   ├── IRStatistics.java

│   ├── RecommenderBuilder.java

│   ├── RecommenderEvaluator.java

│   ├── RecommenderIRStatsEvaluator.java

│   └── RelevantItemsDataSplitter.java

├── hadoop

│   ├── EntityEntityWritable.java

│   ├── EntityPrefWritable.java

│   ├── MutableRecommendedItem.java

│   ├── RecommendedItemsWritable.java

│   ├── TasteHadoopUtils.java

│   ├── ToEntityPrefsMapper.java

│   ├── ToItemPrefsMapper.java

│   ├── TopItemsQueue.java

│   ├── als

│   ├── item

│   ├── preparation

│   └── similarity

├── impl

│   ├── common

│   ├── eval

│   ├── model

│   ├── neighborhood

│   ├── recommender

│   └── similarity

├── model

│   ├── DataModel.java

│   ├── IDMigrator.java

│   ├── JDBCDataModel.java

│   ├── Preference.java

│   ├── PreferenceArray.java

│   └── UpdatableIDMigrator.java

├── neighborhood

│   └── UserNeighborhood.java

├── recommender

│   ├── CandidateItemsStrategy.java

│   ├── IDRescorer.java

│   ├── ItemBasedRecommender.java

│   ├── MostSimilarItemsCandidateItemsStrategy.java

│   ├── RecommendedItem.java

│   ├── Recommender.java

│   ├── Rescorer.java

│   └── UserBasedRecommender.java

└── similarity

    ├── ItemSimilarity.java

    ├── PreferenceInferrer.java

    ├── UserSimilarity.java

    └── precompute

similarity  相似度的interface定义

recommender 推荐算法的interface定义

model  数据model类型的interface定义

impl 目录 则是以上interface定义的实现

PearsonCorrelationSimilarity的实现在

~/mahout-core/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java

/**
 * Builds a Pearson-correlation similarity on top of the given data model.
 *
 * @param dataModel model whose preferences are compared; it must carry preference values
 * @param weighting weighting mode, forwarded unchanged to the superclass constructor
 * @throws IllegalArgumentException if {@link DataModel} does not have preference values
 * @throws TasteException declared by this constructor; the call that raises it is not visible here
 */
public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException {
  // The third argument (centerData) is always passed as true here.
  //
  // Pearson correlation amounts to subtracting each vector's mean from it and then taking the
  // cosine of the two centered vectors. The centering is implemented in AbstractSimilarity:
  //
  //   double result;
  //   if (centerData) {
  //     double meanX = sumX / count;
  //     double meanY = sumY / count;
  //     // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
  //     double centeredSumXY = sumXY - meanY * sumX;
  //     // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
  //     double centeredSumX2 = sumX2 - meanX * sumX;
  //     // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
  //     double centeredSumY2 = sumY2 - meanY * sumY;
  //     result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
  //   } else {
  //     result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
  //   }
  super(dataModel, weighting, true);
  boolean hasValues = dataModel.hasPreferenceValues();
  Preconditions.checkArgument(hasValues, "DataModel doesn't have preference values");
}

/**
 * Turns the accumulated (already centered) sums into a correlation value.
 *
 * @param n number of value pairs that were accumulated
 * @param sumXY sum of the products x * y
 * @param sumX2 sum of the squares x * x
 * @param sumY2 sum of the squares y * y
 * @param sumXYdiff2 sum of the squared differences (x - y); not used by this measure
 * @return {@code sumXY / (sqrt(sumX2) * sqrt(sumY2))}, or {@code Double.NaN} when no pairs were
 *     accumulated or the denominator is zero
 */
@Override
double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) {
  // sum(X) and sum(Y) are absent on purpose: the data is assumed to be centered, so both are 0.
  double norm = Math.sqrt(sumX2) * Math.sqrt(sumY2);
  // A zero norm means one or both parties gave identical ratings throughout; this measure
  // cannot say anything about similarity in that case, nor when there are no pairs at all.
  boolean undefined = n == 0 || norm == 0.0;
  return undefined ? Double.NaN : sumXY / norm;
}

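就是皮尔逊相关系数数学公式的实现(这里的 x、y 已经中心化,即各自减去了平均值):

$$ r = \frac{\sum_i x_i y_i}{\sqrt{\sum_i x_i^2}\,\sqrt{\sum_i y_i^2}} $$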

具体的累加,已经在父类 AbstractSimilarity 的 userSimilarity 方法里面做了:

/**
 * Computes the similarity of two users from their preference values.
 *
 * <p>Both preference arrays are walked in step by item ID, always advancing the side with the
 * smaller ID (this presumably relies on the arrays being sorted by item ID - confirm against the
 * PreferenceArray contract). For every item that contributes, the running sums of x, y, x*x, y*y,
 * x*y and (x-y)^2 are updated. Without an inferrer only items rated by both users contribute;
 * with an inferrer, items rated by just one user contribute too, using an inferred value for
 * the other user.
 *
 * @param userID1 first user; its values are the "x" series
 * @param userID2 second user; its values are the "y" series
 * @return the value produced by {@code computeResult}, passed through
 *     {@code normalizeWeightResult} unless it is NaN; {@code Double.NaN} if either user has no
 *     preferences
 * @throws TasteException declared by this method; presumably propagated from the data model or
 *     inferrer calls
 */
@Override
public double userSimilarity(long userID1, long userID2) throws TasteException {
DataModel dataModel = getDataModel();
  // Fetch both users' preferences.
PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1);
PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2);
int xLength = xPrefs.length();
// Nothing to compare if either user has no preferences at all.
int yLength = yPrefs.length(); if (xLength == 0 || yLength == 0) {
return Double.NaN;
// xIndex / yIndex hold the item ID currently under each cursor;
// xPrefIndex / yPrefIndex are the cursor positions inside the arrays.
} long xIndex = xPrefs.getItemID(0);
long yIndex = yPrefs.getItemID(0);
int xPrefIndex = 0;
// Running sums over all contributing (x, y) pairs.
int yPrefIndex = 0; double sumX = 0.0;
double sumX2 = 0.0;
double sumY = 0.0;
double sumY2 = 0.0;
double sumXY = 0.0;
double sumXYdiff2 = 0.0;
// count is the number of pairs accumulated; the loop below exits only via break.
int count = 0; boolean hasInferrer = inferrer != null; while (true) {
// compare < 0: x's current item ID is smaller; > 0: y's is smaller; 0: same item.
int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0;
// Tally this position if both users rated the item, or if a missing rating can be inferred.
if (hasInferrer || compare == 0) {
double x;
double y;
if (xIndex == yIndex) {
// Both users expressed a preference for the item
x = xPrefs.getValue(xPrefIndex);
y = yPrefs.getValue(yPrefIndex);
} else {
// If the matching rating does not exist, infer it...
// Only one user expressed a preference, but infer the other one's preference and tally
// as if the other user expressed that preference
if (compare < 0) {
// X has a value; infer Y's
x = xPrefs.getValue(xPrefIndex);
y = inferrer.inferPreference(userID2, xIndex);
} else {
// compare > 0
// Y has a value; infer X's
x = inferrer.inferPreference(userID1, yIndex);
y = yPrefs.getValue(yPrefIndex);
}
}
sumXY += x * y;
sumX += x;
sumX2 += x * x;
sumY += y;
sumY2 += y * y;
double diff = x - y;
sumXYdiff2 += diff * diff;
count++;
}
// Advance the x cursor when x was behind or level with y.
if (compare <= 0) {
if (++xPrefIndex >= xLength) {
if (hasInferrer) {
// Must count other Ys; pretend next X is far away
if (yIndex == Long.MAX_VALUE) {
// ... but stop if both are done!
break;
}
xIndex = Long.MAX_VALUE;
} else {
// Without an inferrer, no further common items are possible once x is exhausted.
break;
}
} else {
xIndex = xPrefs.getItemID(xPrefIndex);
}
}
// Advance the y cursor when y was behind or level with x.
if (compare >= 0) {
if (++yPrefIndex >= yLength) {
if (hasInferrer) {
// Must count other Xs; pretend next Y is far away
if (xIndex == Long.MAX_VALUE) {
// ... but stop if both are done!
break;
}
yIndex = Long.MAX_VALUE;
} else {
// Without an inferrer, no further common items are possible once y is exhausted.
break;
}
} else {
yIndex = yPrefs.getItemID(yPrefIndex);
}
}
} // "Center" the data. If my math is correct, this'll do it.
double result;
if (centerData) {
// Subtract the means algebraically from the sums instead of re-walking the data;
// the commented-out lines show the unsimplified forms.
double meanX = sumX / count;
double meanY = sumY / count;
// double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY;
double centeredSumXY = sumXY - meanY * sumX;
// double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX;
double centeredSumX2 = sumX2 - meanX * sumX;
// double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY;
double centeredSumY2 = sumY2 - meanY * sumY;
result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2);
} else {
result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2);
// A NaN result is returned as-is; only a real value goes through normalizeWeightResult.
} if (!Double.isNaN(result)) {
result = normalizeWeightResult(result, count, cachedNumItems);
}
return result;
}

参考:

http://blog.csdn.net/v_july_v/article/details/7184318

http://blog.sina.com.cn/s/blog_73de143c010153vp.html

Apache mahout 源码阅读笔记--协同过滤, PearsonCorrelationSimilarity的更多相关文章

  1. Apache mahout 源码阅读笔记-DataModel之UserBaseRecommender

    先来看一下使用流程: 1)拿到DataModel 2)定义相似度计算模型 PearsonCorrelationSimilarity 3)定义用户邻域计算模型 NearestNUserNeighborh ...

  2. Apache mahout 源码阅读笔记--DataModel之FileDataModel

    要做推荐,用户行为数据是基础. 用户行为数据有哪些字段呢? mahout的DataModel支持,用户ID,ItemID是必须的,偏好值(用户对当前Item的评分),时间戳 这四个字段 {@code ...

  3. Apache Storm源码阅读笔记

    欢迎转载,转载请注明出处. 楔子 自从建了Spark交流的QQ群之后,热情加入的同学不少,大家不仅对Spark很热衷对于Storm也是充满好奇.大家都提到一个问题就是有关storm内部实现机理的资料比 ...

  4. CI框架源码阅读笔记3 全局函数Common.php

    从本篇开始,将深入CI框架的内部,一步步去探索这个框架的实现.结构和设计. Common.php文件定义了一系列的全局函数(一般来说,全局函数具有最高的加载优先权,因此大多数的框架中BootStrap ...

  5. Mina源码阅读笔记(四)—Mina的连接IoConnector2

    接着Mina源码阅读笔记(四)-Mina的连接IoConnector1,,我们继续: AbstractIoAcceptor: 001 package org.apache.mina.core.rewr ...

  6. CI框架源码阅读笔记5 基准测试 BenchMark.php

    上一篇博客(CI框架源码阅读笔记4 引导文件CodeIgniter.php)中,我们已经看到:CI中核心流程的核心功能都是由不同的组件来完成的.这些组件类似于一个一个单独的模块,不同的模块完成不同的功 ...

  7. CI框架源码阅读笔记4 引导文件CodeIgniter.php

    到了这里,终于进入CI框架的核心了.既然是“引导”文件,那么就是对用户的请求.参数等做相应的导向,让用户请求和数据流按照正确的线路各就各位.例如,用户的请求url: http://you.host.c ...

  8. CI框架源码阅读笔记2 一切的入口 index.php

    上一节(CI框架源码阅读笔记1 - 环境准备.基本术语和框架流程)中,我们提到了CI框架的基本流程,这里再次贴出流程图,以备参考: 作为CI框架的入口文件,源码阅读,自然由此开始.在源码阅读的过程中, ...

  9. 源码阅读笔记 - 1 MSVC2015中的std::sort

    大约寒假开始的时候我就已经把std::sort的源码阅读完毕并理解其中的做法了,到了寒假结尾,姑且把它写出来 这是我的第一篇源码阅读笔记,以后会发更多的,包括算法和库实现,源码会按照我自己的代码风格格 ...

随机推荐

  1. sama5d3 环境检测 gpio--yx测试

    说明: yx0--pioA0 yx1--pioA2  yx2--pioA4  yx3--pioA10  yx4--pioA14  yx5--pioA16 yx6--pioA12 yx7--pioA20 ...

  2. 大数据(11) - kafka的安装与使用

    一.Kafka概述 1.Kafka是什么 在流式计算中,Kafka一般用来缓存数据,Storm通过消费Kafka的数据进行计算. 1)Apache Kafka是一个开源消息系统,由Scala写成.是由 ...

  3. LVM简介

    3. 创建VG.. 7 4. 创建LV.. 9 5.LV格式化及挂载... 10 一.LVM简介 LVM是 Logical Volume Manager(逻辑卷管理)的简写,它由Heinz Mauel ...

  4. PostgreSQL视频去重 图片去重系列1

    PostgreSQL 在视频.图片去重,图像搜索业务中的应用 图片搜索 PostgreSQL的图像搜索插件使用了非常主流的Haar wavelet技术对图像进行变换后存储 gist 索引方法(支持pa ...

  5. 02 观察 mysql 周期性变化

    ()首先写一个shell 脚本 vim mysql_status.sh 脚本如下: #!bin/bash while true do mysqladmin -urooy ext|awk '/Queri ...

  6. 【BZOJ】1628 && 1683: [Usaco2007 Demo]City skyline 城市地平线(单调栈)

    http://www.lydsy.com/JudgeOnline/problem.php?id=1628 http://www.lydsy.com/JudgeOnline/problem.php?id ...

  7. web版pdf在线阅读器

    近期论坛里有人提到了,在线pdf阅读器怎么做.我百度了一下,发现非常多人都非常懒.程序猿都非常懒吗? 答案是否定的. 为什么都不愿意去搜索一下呢.网上非常多答案的.我这里就列举一例.大家共勉. 看代码 ...

  8. 说说M451例程之PWM

    /**************************************************************************//** * @file main.c * @ve ...

  9. iOS-多线程的底层实现

    (1)首先回答什么是线程 1个进程要想执行任务,必须得有线程.线程是进程的基本执行单元,一个进程(程序)的所有任务都在线程中执行 (2)什么是多线程 1个进程中可以开启多条线程,每条线程可以并行(同时 ...

  10. WEB安全番外第四篇--关于SQL盲注

    一.SQL盲注: 看不到回显的,无法从返回直接读取到数据库内容的对数据的猜解,属于盲注. 二.第一种--基于布尔类型的盲注: 这种很简单,最典型的例子,就是挖SQL注入的时候常用的: ''' http ...