package engine

import (
    "fmt"
    "github.com/huichen/murmur"
    "github.com/huichen/sego"
    "github.com/huichen/wukong/core"
    "github.com/huichen/wukong/storage"
    "github.com/huichen/wukong/types"
    "github.com/huichen/wukong/utils"
    "log"
    "os"
    "runtime"
    "sort"
    "strconv"
    "sync/atomic"
    "time"
)

const (
    NumNanosecondsInAMillisecond = 1000000
    PersistentStorageFilePrefix  = "wukong"
)

type Engine struct {
    // Counters tracking how many documents have been indexed, removed, force-updated, etc.
    numDocumentsIndexed      uint64
    numDocumentsRemoved      uint64
    numDocumentsForceUpdated uint64
    numIndexingRequests      uint64
    numRemovingRequests      uint64
    numForceUpdatingRequests uint64
    numTokenIndexAdded       uint64
    numDocumentsStored       uint64

    // Initialization options recorded at Init time
    initOptions types.EngineInitOptions
    initialized bool

    indexers   []core.Indexer
    rankers    []core.Ranker
    segmenter  sego.Segmenter
    stopTokens StopTokens
    dbs        []storage.Storage

    // Channels used by the indexing pipeline
    segmenterChannel         chan segmenterRequest
    indexerAddDocChannels    []chan indexerAddDocumentRequest
    indexerRemoveDocChannels []chan indexerRemoveDocRequest
    rankerAddDocChannels     []chan rankerAddDocRequest

    // Channels used by the search/ranking pipeline
    indexerLookupChannels   []chan indexerLookupRequest
    rankerRankChannels      []chan rankerRankRequest
    rankerRemoveDocChannels []chan rankerRemoveDocRequest

    // Channels used by the persistent storage workers
    persistentStorageIndexDocumentChannels []chan persistentStorageIndexDocumentRequest
    persistentStorageInitChannel           chan bool
}

func (engine *Engine) Init(options types.EngineInitOptions) {
    // Use as many OS threads as there are CPUs
    runtime.GOMAXPROCS(runtime.NumCPU())

    // Record the initialization options
    if engine.initialized {
        log.Fatal("The engine has already been initialized")
    }
    options.Init()
    engine.initOptions = options
    engine.initialized = true

    if !options.NotUsingSegmenter {
        // Load the segmenter dictionaries
        engine.segmenter.LoadDictionary(options.SegmenterDictionaries)

        // Initialize the stop tokens
        engine.stopTokens.Init(options.StopTokenFile)
    }

    // Initialize the indexers and rankers
    for shard := 0; shard < options.NumShards; shard++ {
        engine.indexers = append(engine.indexers, core.Indexer{})
        engine.indexers[shard].Init(*options.IndexerInitOptions)

        engine.rankers = append(engine.rankers, core.Ranker{})
        engine.rankers[shard].Init()
    }

    // Initialize the segmenter channel
    engine.segmenterChannel = make(
        chan segmenterRequest, options.NumSegmenterThreads)

    // Initialize the indexer channels
    engine.indexerAddDocChannels = make(
        []chan indexerAddDocumentRequest, options.NumShards)
    engine.indexerRemoveDocChannels = make(
        []chan indexerRemoveDocRequest, options.NumShards)
    engine.indexerLookupChannels = make(
        []chan indexerLookupRequest, options.NumShards)
    for shard := 0; shard < options.NumShards; shard++ {
        engine.indexerAddDocChannels[shard] = make(
            chan indexerAddDocumentRequest,
            options.IndexerBufferLength)
        engine.indexerRemoveDocChannels[shard] = make(
            chan indexerRemoveDocRequest,
            options.IndexerBufferLength)
        engine.indexerLookupChannels[shard] = make(
            chan indexerLookupRequest,
            options.IndexerBufferLength)
    }

    // Initialize the ranker channels
    engine.rankerAddDocChannels = make(
        []chan rankerAddDocRequest, options.NumShards)
    engine.rankerRankChannels = make(
        []chan rankerRankRequest, options.NumShards)
    engine.rankerRemoveDocChannels = make(
        []chan rankerRemoveDocRequest, options.NumShards)
    for shard := 0; shard < options.NumShards; shard++ {
        engine.rankerAddDocChannels[shard] = make(
            chan rankerAddDocRequest,
            options.RankerBufferLength)
        engine.rankerRankChannels[shard] = make(
            chan rankerRankRequest,
            options.RankerBufferLength)
        engine.rankerRemoveDocChannels[shard] = make(
            chan rankerRemoveDocRequest,
            options.RankerBufferLength)
    }

    // Initialize the persistent storage channels
    if engine.initOptions.UsePersistentStorage {
        engine.persistentStorageIndexDocumentChannels =
            make([]chan persistentStorageIndexDocumentRequest,
                engine.initOptions.PersistentStorageShards)
        for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
            engine.persistentStorageIndexDocumentChannels[shard] = make(
                chan persistentStorageIndexDocumentRequest)
        }
        engine.persistentStorageInitChannel = make(
            chan bool, engine.initOptions.PersistentStorageShards)
    }

    // Start the segmenter workers
    for iThread := 0; iThread < options.NumSegmenterThreads; iThread++ {
        go engine.segmenterWorker()
    }

    // Start the indexer and ranker workers
    for shard := 0; shard < options.NumShards; shard++ {
        go engine.indexerAddDocumentWorker(shard)
        go engine.indexerRemoveDocWorker(shard)
        go engine.rankerAddDocWorker(shard)
        go engine.rankerRemoveDocWorker(shard)

        for i := 0; i < options.NumIndexerThreadsPerShard; i++ {
            go engine.indexerLookupWorker(shard)
        }
        for i := 0; i < options.NumRankerThreadsPerShard; i++ {
            go engine.rankerRankWorker(shard)
        }
    }

    // Start the persistent storage workers
    if engine.initOptions.UsePersistentStorage {
        err := os.MkdirAll(engine.initOptions.PersistentStorageFolder, 0700)
        if err != nil {
            log.Fatal("Failed to create directory ", engine.initOptions.PersistentStorageFolder)
        }

        // Open or create the databases
        engine.dbs = make([]storage.Storage, engine.initOptions.PersistentStorageShards)
        for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
            dbPath := engine.initOptions.PersistentStorageFolder + "/" + PersistentStorageFilePrefix + "." + strconv.Itoa(shard)
            db, err := storage.OpenStorage(dbPath)
            if db == nil || err != nil {
                log.Fatal("Failed to open database ", dbPath, ": ", err)
            }
            engine.dbs[shard] = db
        }

        // Restore documents from the databases
        for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
            go engine.persistentStorageInitWorker(shard)
        }

        // Wait for the restore to finish
        for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
            <-engine.persistentStorageInitChannel
        }
        for {
            runtime.Gosched()
            if engine.numIndexingRequests == engine.numDocumentsIndexed {
                break
            }
        }

        // Close and reopen the databases
        for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
            engine.dbs[shard].Close()
            dbPath := engine.initOptions.PersistentStorageFolder + "/" + PersistentStorageFilePrefix + "." + strconv.Itoa(shard)
            db, err := storage.OpenStorage(dbPath)
            if db == nil || err != nil {
                log.Fatal("Failed to open database ", dbPath, ": ", err)
            }
            engine.dbs[shard] = db
        }

        for shard := 0; shard < engine.initOptions.PersistentStorageShards; shard++ {
            go engine.persistentStorageIndexDocumentWorker(shard)
        }
    }

    atomic.AddUint64(&engine.numDocumentsStored, engine.numIndexingRequests)
}
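
// Usage sketch (illustrative, not part of the original file): a minimal
// initialization. The dictionary path below is a placeholder; the remaining
// fields fall back to the defaults filled in by options.Init().
//
//    var searcher Engine
//    searcher.Init(types.EngineInitOptions{
//        SegmenterDictionaries: "data/dictionary.txt", // placeholder path
//    })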

// Add a document to the index.
//
// Arguments:
//  docId       document identifier, must be unique; docId == 0 marks an invalid document
//              (used to force an index flush), while [1, +oo) marks valid documents
//  data        see the comments on DocumentIndexData
//  forceUpdate whether to force a cache flush; if true the document is pushed to the index
//              as soon as possible, otherwise it waits until the cache is full and is then
//              added in one batch
//
// Notes:
//      1. This function is thread-safe; call it concurrently to speed up indexing.
//      2. The call is asynchronous, i.e. the document may not be in the index yet when the
//         function returns, so an immediate Search may not find it. To force an index
//         flush, call FlushIndex.
func (engine *Engine) IndexDocument(docId uint64, data types.DocumentIndexData, forceUpdate bool) {
    engine.internalIndexDocument(docId, data, forceUpdate)

    if engine.initOptions.UsePersistentStorage && docId != 0 {
        hash := murmur.Murmur3([]byte(fmt.Sprintf("%d", docId))) % uint32(engine.initOptions.PersistentStorageShards)
        engine.persistentStorageIndexDocumentChannels[hash] <- persistentStorageIndexDocumentRequest{docId: docId, data: data}
    }
}
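
// Usage sketch (illustrative): IndexDocument is thread-safe and asynchronous,
// so documents can be added from several goroutines, and FlushIndex is called
// afterwards to make them searchable. The texts and ids are made up; searcher
// is the engine from the Init sketch above.
//
//    var wg sync.WaitGroup
//    texts := []string{"first document", "second document"}
//    for i, text := range texts {
//        wg.Add(1)
//        go func(id uint64, content string) {
//            defer wg.Done()
//            searcher.IndexDocument(id, types.DocumentIndexData{Content: content}, false)
//        }(uint64(i+1), text)
//    }
//    wg.Wait()
//    searcher.FlushIndex() // block until the documents above are indexed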

func (engine *Engine) internalIndexDocument(
    docId uint64, data types.DocumentIndexData, forceUpdate bool) {
    if !engine.initialized {
        log.Fatal("The engine must be initialized first")
    }

    if docId != 0 {
        atomic.AddUint64(&engine.numIndexingRequests, 1)
    }
    if forceUpdate {
        atomic.AddUint64(&engine.numForceUpdatingRequests, 1)
    }
    hash := murmur.Murmur3([]byte(fmt.Sprintf("%d%s", docId, data.Content)))
    engine.segmenterChannel <- segmenterRequest{
        docId: docId, hash: hash, data: data, forceUpdate: forceUpdate}
}

// Remove a document from the index.
//
// Arguments:
//  docId       document identifier, must be unique; docId == 0 marks an invalid document
//              (used to force an index flush), while [1, +oo) marks valid documents
//  forceUpdate whether to force a cache flush; if true the removal is applied to the index
//              as soon as possible, otherwise it waits until the cache is full and is then
//              applied in one batch
//
// Notes:
//      1. This function is thread-safe; call it concurrently to speed up indexing.
//      2. The call is asynchronous, i.e. the removal may not have taken effect yet when the
//         function returns, so an immediate Search may still return the document. To force
//         an index flush, call FlushIndex.
func (engine *Engine) RemoveDocument(docId uint64, forceUpdate bool) {
    if !engine.initialized {
        log.Fatal("The engine must be initialized first")
    }

    if docId != 0 {
        atomic.AddUint64(&engine.numRemovingRequests, 1)
    }
    if forceUpdate {
        atomic.AddUint64(&engine.numForceUpdatingRequests, 1)
    }
    for shard := 0; shard < engine.initOptions.NumShards; shard++ {
        engine.indexerRemoveDocChannels[shard] <- indexerRemoveDocRequest{docId: docId, forceUpdate: forceUpdate}
        if docId == 0 {
            continue
        }
        engine.rankerRemoveDocChannels[shard] <- rankerRemoveDocRequest{docId: docId}
    }

    if engine.initOptions.UsePersistentStorage && docId != 0 {
        // Remove the document from the database
        hash := murmur.Murmur3([]byte(fmt.Sprintf("%d", docId))) % uint32(engine.initOptions.PersistentStorageShards)
        go engine.persistentStorageRemoveDocumentWorker(docId, hash)
    }
}
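
// Usage sketch (illustrative): remove a previously indexed document, then
// flush so the removal has taken effect before the next Search. The id is an
// arbitrary example.
//
//    searcher.RemoveDocument(1, false)
//    searcher.FlushIndex()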

// Search for documents that satisfy the search criteria. This function is thread-safe.
func (engine *Engine) Search(request types.SearchRequest) (output types.SearchResponse) {
    if !engine.initialized {
        log.Fatal("The engine must be initialized first")
    }

    var rankOptions types.RankOptions
    if request.RankOptions == nil {
        rankOptions = *engine.initOptions.DefaultRankOptions
    } else {
        rankOptions = *request.RankOptions
    }
    if rankOptions.ScoringCriteria == nil {
        rankOptions.ScoringCriteria = engine.initOptions.DefaultRankOptions.ScoringCriteria
    }

    // Collect the query tokens
    tokens := []string{}
    if request.Text != "" {
        querySegments := engine.segmenter.Segment([]byte(request.Text))
        for _, s := range querySegments {
            token := s.Token().Text()
            if !engine.stopTokens.IsStopToken(token) {
                tokens = append(tokens, token)
            }
        }
    } else {
        tokens = append(tokens, request.Tokens...)
    }

    // Create the channel on which the rankers return their results
    rankerReturnChannel := make(
        chan rankerReturnRequest, engine.initOptions.NumShards)

    // Build the lookup request
    lookupRequest := indexerLookupRequest{
        countDocsOnly:       request.CountDocsOnly,
        tokens:              tokens,
        labels:              request.Labels,
        docIds:              request.DocIds,
        options:             rankOptions,
        rankerReturnChannel: rankerReturnChannel,
        orderless:           request.Orderless,
    }

    // Send the lookup request to every indexer shard
    for shard := 0; shard < engine.initOptions.NumShards; shard++ {
        engine.indexerLookupChannels[shard] <- lookupRequest
    }

    // Read the rankers' output from the channel
    numDocs := 0
    rankOutput := types.ScoredDocuments{}
    timeout := request.Timeout
    isTimeout := false
    if timeout <= 0 {
        // No timeout
        for shard := 0; shard < engine.initOptions.NumShards; shard++ {
            rankerOutput := <-rankerReturnChannel
            if !request.CountDocsOnly {
                for _, doc := range rankerOutput.docs {
                    rankOutput = append(rankOutput, doc)
                }
            }
            numDocs += rankerOutput.numDocs
        }
    } else {
        // With a timeout
        deadline := time.Now().Add(time.Nanosecond * time.Duration(NumNanosecondsInAMillisecond*request.Timeout))
        for shard := 0; shard < engine.initOptions.NumShards; shard++ {
            select {
            case rankerOutput := <-rankerReturnChannel:
                if !request.CountDocsOnly {
                    for _, doc := range rankerOutput.docs {
                        rankOutput = append(rankOutput, doc)
                    }
                }
                numDocs += rankerOutput.numDocs
            case <-time.After(deadline.Sub(time.Now())):
                // Deadline passed; the remaining shards will also time out immediately
                isTimeout = true
            }
        }
    }

    // Re-sort the merged results
    if !request.CountDocsOnly && !request.Orderless {
        if rankOptions.ReverseOrder {
            sort.Sort(sort.Reverse(rankOutput))
        } else {
            sort.Sort(rankOutput)
        }
    }

    // Prepare the output
    output.Tokens = tokens
    // output.Docs is only filled when CountDocsOnly is false
    if !request.CountDocsOnly {
        if request.Orderless {
            // In orderless mode there is no need to truncate by the output offset
            output.Docs = rankOutput
        } else {
            var start, end int
            if rankOptions.MaxOutputs == 0 {
                start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))
                end = len(rankOutput)
            } else {
                start = utils.MinInt(rankOptions.OutputOffset, len(rankOutput))
                end = utils.MinInt(start+rankOptions.MaxOutputs, len(rankOutput))
            }
            output.Docs = rankOutput[start:end]
        }
    }
    output.NumDocs = numDocs
    output.Timeout = isTimeout
    return
}
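
// Usage sketch (illustrative): a plain-text query with a 100 ms timeout; the
// query string is arbitrary. Timeout is in milliseconds and a value <= 0
// disables the deadline.
//
//    response := searcher.Search(types.SearchRequest{
//        Text:    "search engine",
//        Timeout: 100,
//    })
//    fmt.Println(response.NumDocs, response.Timeout, len(response.Docs))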

// Block until all pending index additions (and removals) have been processed
func (engine *Engine) FlushIndex() {
    for {
        runtime.Gosched()
        if engine.numIndexingRequests == engine.numDocumentsIndexed &&
            engine.numRemovingRequests*uint64(engine.initOptions.NumShards) == engine.numDocumentsRemoved &&
            (!engine.initOptions.UsePersistentStorage || engine.numIndexingRequests == engine.numDocumentsStored) {
            // Make sure all requests in the channels have been processed
            break
        }
    }
    // Force an update to guarantee that this is the last request
    engine.IndexDocument(0, types.DocumentIndexData{}, true)
    for {
        runtime.Gosched()
        if engine.numForceUpdatingRequests*uint64(engine.initOptions.NumShards) == engine.numDocumentsForceUpdated {
            return
        }
    }
}

// Close the engine
func (engine *Engine) Close() {
    engine.FlushIndex()
    if engine.initOptions.UsePersistentStorage {
        for _, db := range engine.dbs {
            db.Close()
        }
    }
}

// Map a text hash to the shard it is assigned to
func (engine *Engine) getShard(hash uint32) int {
    return int(hash % uint32(engine.initOptions.NumShards))
}
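
// Worked example (illustrative): getShard is equivalent to hash % NumShards,
// so with NumShards == 8 a hash of 27 maps to shard 3 (27 % 8 == 3). Once the
// engine is no longer needed, Close flushes the index and closes any
// persistent storage databases:
//
//    searcher.Close()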
