C++ 大规模数据排序(100G数据使用 4G 内存排序)

思路很简单,先分段排序,存储到临时文件中,然后合并.

使用10000个整数来模拟大数据,每次读取100个到内存中.

#include <stdint.h>

#include <stdlib.h>

#include <stdio.h>

// Size, in bytes, of the stack buffers used to build file names.
// NOTE(review): the original value was lost from the listing; 256 is an
// assumed replacement that comfortably fits names like "data_10000.txt".
enum
{
    enmMaxFileNameLen = 256,
};

// Writes arrSize values from arrInt to fileName as text, each followed by a comma.
void SaveArrToFile(int32_t arrInt[], int32_t arrSize, const char *fileName);

// Opens fileName and reads up to amxArrSize comma-separated integers into
// arrInt; the number of values read is reported through arrSize.
void ReadArrFromFile(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, const char *fileName);

// Same as ReadArrFromFile, but continues from the current position of an
// already-open stream, so it can be called repeatedly to read a file in chunks.
void ReadArrFromFilePtr(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, FILE *fp);

// Fills arrInt with index-derived values and then shuffles them using rand().
void RandomGenArrInt(int32_t arrInt[], int32_t arrSize);

// Generates numberCount shuffled integers and saves them to "data_<numberCount>.txt".
void RandomGenData(int32_t numberCount);

// Quicksort of arrInt over the inclusive index range [start, end].
void QSort(int32_t arrInt[], int32_t start, int32_t end);

// External sort driver: splits fileName into sorted chunk files via Segment,
// then merges the numbered files pairwise with MergeTwoFile.
void BigDataSort(const char *fileName, const int32_t maxNumberInMem);

// Reads fileName in chunks of at most maxNumberInMem values, sorts each chunk
// and saves it to "<index>.txt"; returns the number of chunk files written.
int32_t Segment(const char *fileName, const int32_t maxNumberInMem);

// Merges the values of fileName1 and fileName2 into fileOut, always emitting
// the smaller current value first (inputs are expected to be sorted already).
int32_t MergeTwoFile(const char *fileName1, const char *fileName2, const char *fileOut);

// Prints arrSize values from arr to stdout, separated by spaces.
void PrintArrInt(int32_t arr[], int32_t arrSize);

// Demo entry point: generates a shuffled data file, sorts it externally and
// then waits for a key press so the console output stays visible.
//
// NOTE(review): the literal arguments were missing from the listing. 10000
// values and a 100-value in-memory budget are taken from the accompanying
// description and from the hard-coded "data_10000.txt" name - confirm.
int32_t main()
{
    RandomGenData(10000);

    BigDataSort("data_10000.txt", 100);

    getchar();

    return 0;
}

// Fills arrInt[0..arrSize-1] with the values 1..arrSize in random order.
//
// arrInt  - destination buffer with room for arrSize elements.
// arrSize - number of elements to generate; values <= 0 leave arrInt untouched
//           (the previous code evaluated rand() % 0 in that case).
//
// The sequence depends on the current rand() state; the caller decides whether
// to seed it with srand().
// NOTE(review): the "+ 1" start value was missing from the listing and is
// assumed - confirm.
void RandomGenArrInt(int32_t arrInt[], int32_t arrSize)
{
    if (arrSize <= 0)
    {
        return;
    }

    for (int32_t i = 0; i < arrSize; ++i)
    {
        arrInt[i] = i + 1;
    }

    // Fisher-Yates shuffle: every element is swapped exactly once with a
    // position at or below it, which avoids the bias of swapping random pairs.
    for (int32_t last = arrSize - 1; last > 0; --last)
    {
        int32_t pick = rand() % (last + 1);

        int32_t tmp = arrInt[last];
        arrInt[last] = arrInt[pick];
        arrInt[pick] = tmp;
    }
}

// Writes arrSize values from arrInt to fileName as text, each value followed
// by a comma ("3,1,2,"). An existing file is overwritten.
//
// arrInt   - values to write.
// arrSize  - number of values to write.
// fileName - path of the output file.
//
// Prints a message and returns without writing if the file cannot be opened;
// prints "save <name>" on success.
void SaveArrToFile(int32_t arrInt[], int32_t arrSize, const char *fileName)
{
    FILE *fp = NULL;

    fopen_s(&fp, fileName, "w");

    if (!fp)
    {
        printf("open %s failed!\n", fileName);
        return;
    }

    for (int32_t i = 0; i < arrSize; ++i)
    {
        fprintf_s(fp, "%d,", arrInt[i]);
    }

    fclose(fp);

    printf("save %s \n", fileName);
}

// Generates numberCount shuffled integers and saves them to
// "data_<numberCount>.txt" in the current directory.
//
// numberCount - how many values to generate; non-positive counts are rejected
//               with a message instead of reaching new[] / the shuffle.
void RandomGenData(int32_t numberCount)
{
    if (numberCount <= 0)
    {
        printf("invalid number count %d\n", numberCount);
        return;
    }

    int32_t *arr = new int32_t[numberCount];

    RandomGenArrInt(arr, numberCount);

    char fileName[enmMaxFileNameLen] = { 0 };

    sprintf_s(fileName, enmMaxFileNameLen, "data_%d.txt", numberCount);

    SaveArrToFile(arr, numberCount, fileName);

    // The buffer is only needed until the file is written; it was leaked before.
    delete[] arr;
}

// Sorts arrInt in ascending order over the inclusive index range [start, end].
//
// arrInt - array to sort in place.
// start  - index of the first element of the range.
// end    - index of the last element of the range; start >= end is a no-op.
//
// Uses the middle element as pivot so already-sorted input does not degrade
// to quadratic time, and recurses only into the smaller partition so the
// recursion depth stays logarithmic in the range length.
void QSort(int32_t arrInt[], int32_t start, int32_t end)
{
    while (start < end)
    {
        // Move the middle element out as pivot; the slot at `start` becomes
        // the initial "hole" of the partition scheme.
        int32_t mid = start + (end - start) / 2;
        int32_t pivot = arrInt[mid];
        arrInt[mid] = arrInt[start];

        int32_t lo = start;
        int32_t hi = end;

        while (lo < hi)
        {
            // Find an element on the right that belongs left of the pivot
            // and drop it into the hole at lo.
            while (lo < hi && pivot < arrInt[hi])
            {
                --hi;
            }
            arrInt[lo] = arrInt[hi];

            // Find an element on the left that belongs right of the pivot
            // and drop it into the hole now at hi.
            while (lo < hi && pivot >= arrInt[lo])
            {
                ++lo;
            }
            arrInt[hi] = arrInt[lo];
        }

        arrInt[lo] = pivot;

        // Recurse into the smaller side, loop on the larger one.
        if (lo - start < end - lo)
        {
            QSort(arrInt, start, lo - 1);
            start = lo + 1;
        }
        else
        {
            QSort(arrInt, lo + 1, end);
            end = lo - 1;
        }
    }
}

// Reads up to amxArrSize comma-separated integers from the start of fileName.
//
// arrInt     - destination buffer with room for amxArrSize elements.
// arrSize    - out: number of values actually stored (0 if the file cannot
//              be opened or holds no values).
// amxArrSize - maximum number of values to read.
// fileName   - path of the text file to read.
//
// Reading stops at end of file, at the first token that is not an integer, or
// when the buffer is full. Only successfully parsed values are counted, and
// the stream is closed before returning (it was leaked before).
void ReadArrFromFile(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, const char *fileName)
{
    arrSize = 0;

    FILE *fp = NULL;

    fopen_s(&fp, fileName, "r");

    if (!fp)
    {
        printf("open %s failed!\n", fileName);
        return;
    }

    // fscanf_s returns 1 as soon as the integer is parsed, even when the
    // trailing comma is absent, so the last value of a file is never lost.
    while (arrSize < amxArrSize && fscanf_s(fp, "%d,", &arrInt[arrSize]) == 1)
    {
        ++arrSize;
    }

    fclose(fp);
}

// Reads up to amxArrSize comma-separated integers from the current position
// of an open stream, leaving the stream positioned after the last value read
// so the next call continues with the following chunk.
//
// arrInt     - destination buffer with room for amxArrSize elements.
// arrSize    - out: number of values actually stored; 0 means the stream is
//              exhausted (or fp is NULL).
// amxArrSize - maximum number of values to read in this call.
// fp         - stream opened for reading; not closed by this function.
//
// The loop is driven by the fscanf_s result instead of feof(): that keeps a
// final value that has no trailing comma and stops on malformed input rather
// than spinning forever.
void ReadArrFromFilePtr(int32_t arrInt[], int32_t &arrSize, const int32_t amxArrSize, FILE *fp)
{
    arrSize = 0;

    if (!fp)
    {
        return;
    }

    while (arrSize < amxArrSize && fscanf_s(fp, "%d,", &arrInt[arrSize]) == 1)
    {
        ++arrSize;
    }
}

// External merge sort driver.
//
// fileName       - text file of comma-separated integers to sort.
// maxNumberInMem - maximum number of values held in memory at once.
//
// Segment() splits the input into sorted chunk files "0.txt" .. "<n-1>.txt".
// The loop then merges the two oldest unmerged files into a new file with the
// next free index, so merge results queue up behind the chunks and get merged
// in turn. When only one unmerged file is left, MergeTwoFile finds no second
// input, copies the remaining file to the next index and returns non-zero,
// which ends the loop; that last file written holds the fully sorted data.
//
// NOTE(review): the start index 0 and the "ret != 0" stop test were missing
// from the listing and are assumed. Stale "<k>.txt" files from an earlier run
// with more chunks can be picked up as merge inputs - clear the working
// directory between runs.
void BigDataSort(const char *fileName, const int32_t maxNumberInMem)
{
    int32_t segFileCount = Segment(fileName, maxNumberInMem);

    // Nothing was segmented (open failure or empty input): merging would only
    // touch unrelated files that happen to be named "0.txt" / "1.txt".
    if (segFileCount <= 0)
    {
        return;
    }

    int32_t fileIndex = 0;

    char fileName1[enmMaxFileNameLen] = { 0 };
    char fileName2[enmMaxFileNameLen] = { 0 };
    char fileOut[enmMaxFileNameLen] = { 0 };

    while (true)
    {
        sprintf_s(fileName1, enmMaxFileNameLen, "%d.txt", fileIndex++);
        sprintf_s(fileName2, enmMaxFileNameLen, "%d.txt", fileIndex++);
        sprintf_s(fileOut, enmMaxFileNameLen, "%d.txt", segFileCount++);

        int32_t ret = MergeTwoFile(fileName1, fileName2, fileOut);

        if (ret != 0)
        {
            break;
        }
    }
}

// Splits fileName into sorted chunk files.
//
// fileName       - text file of comma-separated integers.
// maxNumberInMem - maximum number of values per chunk (and per in-memory
//                  buffer); must be positive.
//
// Each chunk is read, sorted with QSort and written to "<index>.txt", with
// index counting up from 0.
//
// Returns the number of chunk files written; 0 if maxNumberInMem is not
// positive, the input cannot be opened, or it contains no values.
// NOTE(review): the failure return value was missing from the listing; 0
// ("no files") is assumed.
int32_t Segment(const char *fileName, const int32_t maxNumberInMem)
{
    if (maxNumberInMem <= 0)
    {
        return 0;
    }

    FILE *fp = NULL;

    fopen_s(&fp, fileName, "r");

    if (!fp)
    {
        printf("open %s failed!\n", fileName);
        return 0;
    }

    // Allocate only after the open succeeded so the failure path above has
    // nothing to release (the buffer used to leak there).
    int32_t *arr = new int32_t[maxNumberInMem];

    int32_t tmpFileIndex = 0;

    while (true)
    {
        int32_t arrSize = 0;

        ReadArrFromFilePtr(arr, arrSize, maxNumberInMem, fp);

        // An empty read means the input is exhausted.
        if (arrSize == 0)
        {
            break;
        }

        QSort(arr, 0, arrSize - 1);

        char tmpFileName[enmMaxFileNameLen] = { 0 };

        sprintf_s(tmpFileName, enmMaxFileNameLen, "%d.txt", tmpFileIndex++);

        SaveArrToFile(arr, arrSize, tmpFileName);
    }

    fclose(fp);

    delete[] arr;

    return tmpFileIndex;
}

// Merges two sorted files of comma-separated integers into fileOut.
//
// fileName1, fileName2 - input files, each expected to be sorted ascending.
//                        Either may be missing; the other one is then simply
//                        copied to fileOut.
// fileOut              - output file, overwritten; every value is written
//                        followed by a comma.
//
// Returns 0 when both inputs contributed at least one value (a real merge
// took place), -1 otherwise - including when fileOut cannot be opened.
// NOTE(review): the 0 / -1 values were missing from the listing and are
// assumed from how BigDataSort tests the result.
int32_t MergeTwoFile(const char *fileName1, const char *fileName2, const char *fileOut)
{
    int32_t ret = -1;

    FILE *fp1 = NULL, *fp2 = NULL, *fpOut = NULL;

    fopen_s(&fp1, fileName1, "r");
    fopen_s(&fp2, fileName2, "r");
    fopen_s(&fpOut, fileOut, "w");

    // Test the stream, not the name: the previous check on fileOut could never
    // detect a failed open and the code went on to write through a NULL FILE*.
    if (!fpOut)
    {
        printf("open %s failed!\n", fileOut);

        if (fp1) { fclose(fp1); }
        if (fp2) { fclose(fp2); }

        return ret;
    }

    int32_t val1 = 0, val2 = 0;

    // hasN is true while valN holds a value that has not been written yet.
    // Driving the loops by the fscanf_s result (instead of feof) keeps a last
    // value without a trailing comma and stops cleanly on malformed input.
    bool has1 = fp1 && fscanf_s(fp1, "%d,", &val1) == 1;
    bool has2 = fp2 && fscanf_s(fp2, "%d,", &val2) == 1;

    while (has1 && has2)
    {
        if (val1 < val2)
        {
            fprintf_s(fpOut, "%d,", val1);
            has1 = fscanf_s(fp1, "%d,", &val1) == 1;
        }
        else
        {
            fprintf_s(fpOut, "%d,", val2);
            has2 = fscanf_s(fp2, "%d,", &val2) == 1;
        }

        ret = 0;
    }

    // At most one of the two tails below still has data.
    while (has1)
    {
        fprintf_s(fpOut, "%d,", val1);
        has1 = fscanf_s(fp1, "%d,", &val1) == 1;
    }

    while (has2)
    {
        fprintf_s(fpOut, "%d,", val2);
        has2 = fscanf_s(fp2, "%d,", &val2) == 1;
    }

    if (fp1) { fclose(fp1); }
    if (fp2) { fclose(fp2); }

    fclose(fpOut);

    printf("save %s \n", fileOut);

    return ret;
}

// Prints the first arrSize values of arr to stdout, each followed by a space.
// No newline is written; a non-positive arrSize prints nothing.
void PrintArrInt(int32_t arr[], int32_t arrSize)
{
    for (int32_t i = 0; i < arrSize; ++i)
    {
        printf("%d ", arr[i]);
    }
}

C++ 大规模数据排序(100G数据使用 4G 内存排序)的更多相关文章

多线程更新已排序的Datagridview数据，造成数据错位
多线程更新已排序的Datagridview数据,触发Datagridview的auto-sort时间,数据重新排序,造成后面更新数据的更新错误. 解决方法: 方法一.设置Datagridview的表头 ...
ASP.NET中Dataset的table数据合并、数据截取、数据排序
1.两个相同字段表的合并: public static DataSet CombineTables(DataSet _ds, DataTable _dt1, DataTable _dt2) { Dat ...
for循环中进行联网请求数据、for循环中进行异步数据操作，数据排序错乱问题解决；
for循环中进行联网请求数据,由于网络请求是异步的,第一个网络请求还没有回调,第二次第三次以及后续的网络请求又已经发出去了,有可能后续的网络请求会先回调:这时我们接收到的数据的排序就会错乱:怎么才能让 ...
java中的排序(自定义数据排序)--使用Collections的sort方法
排序:将一组数据按相应的规则排列顺序 1.规则: 基本数据类型:日常的大小排序. 引用类型: 内置引用类型(String,Integer..),内部已经指定规则,直接使用即可.---- ...
Sortable拖拽排序插件数据筛选
后台有拖拽排序功能,然而前段在开发的时候,一整页的数据都发给后端了. 于是查看前端代码,想到了如下解决办法,即先把排序前的保存,然后对比排序后的,有差异的才发回给后端. var new_ids_ord ...
mysql必知必会(四、检索数据，五、排序检索数据，六、过滤数据，七、数据过滤)
四.select语句 1.检索单个列 select prod_name from products; 2.检索多个列 select prod_name, prod_price from product ...
2.排序检索数据 ---SQL
order by 一.排序数据 SELECT prod_name FROM Products ORDER BY prod_name; ORDER BY子句的位置在指定一条ORDER BY子句时,应该 ...
Spark SQL - 对大规模的结构化数据进行批处理和流式处理
Spark SQL - 对大规模的结构化数据进行批处理和流式处理大体翻译自:https://jaceklaskowski.gitbooks.io/mastering-apache-spark/con ...
MySql——创建数据表，查询数据，排序查询数据
参考资料:<Mysql必知必会> 创建数据表在学习前首先创建数据表和插入数据.如何安装mysql可以看看上个博客https://www.cnblogs.com/lbhym/p/11675 ...

随机推荐

web页面效果开源框架收集整合
1.EasyUI:http://www.jeasyui.com/index.php 2.Bootstrap:http://www.bootcss.com/ 3.jqueryui:http://jque ...
AJAX中的同步加载与异步加载
AJAX是四个单词的简写,其中Asynchronous即异步的意思,异步的链接可以同时发起多个,并且不会阻止JS代码执行.与之对应的概念是同步,同步的链接在同一时刻只会有一个,并且会阻止后续JS代码的 ...
java模拟生日发祝福
1.新建customer表生日都选为当天所需jar包 2.使用c3p0连接到数据的xml配置文件 3.连接数据库的工具类 package com.cc.birthday; import java.s ...
javaweb核心技术servlet
一.Servlet简介 1．什么是Servlet Servlet 运行在服务端的Java小程序,是sun公司提供一套规范(接口),用来处理客户端请求.响应给浏览器的动态资源.但servlet的实质 ...
POJ 2689.Prime Distance-区间筛素数
最近改自己的错误代码改到要上天,心累. 这是迄今为止写的最心累的博客. Prime Distance Time Limit: 1000MS Memory Limit: 65536K Total S ...
新疆大学ACM-ICPC程序设计竞赛五月月赛（同步赛）F 猴子排序的期望【Java/高精度/组合数学+概率论】
链接:https://www.nowcoder.com/acm/contest/116/F 来源:牛客网题目描述我们知道有一种神奇的排序方法叫做猴子排序,就是把待排序的数字写在卡片上,然后让猴子把 ...
Codeforces Round #447 (Div. 2) B. Ralph And His Magic Field【数论/组合数学】
B. Ralph And His Magic Field time limit per test 1 second memory limit per test 256 megabytes input ...
poj2976（01分数规划）
poj2976 题意给出 a b 数组,一共 n 对数,其中最多可以去掉 k 对,问怎样使剩下比率(原始比率是 $ \frac{\sum_{i=1}^{n} a}{\sum_{i=1}^{n} b} ...
洛谷—— P1022 计算器的改良
P1022 计算器的改良题目背景 NCL是一家专门从事计算器改良与升级的实验室,最近该实验室收到了某公司所委托的一个任务:需要在该公司某型号的计算器上加上解一元一次方程的功能.实验室将这个任务交给了 ...
sort equal 确保记录按照 input顺序来
Usually you have a requirement of removing the duplicate records from a file using SORT with the opt ...

C++ 大规模数据排序(100G数据 使用 4G 内存 排序)

C++ 大规模数据排序(100G数据 使用 4G 内存 排序)的更多相关文章

随机推荐

热门专题

C++ 大规模数据排序(100G数据使用 4G 内存排序)

C++ 大规模数据排序(100G数据使用 4G 内存排序)的更多相关文章