外部排序是一类能够处理海量数据的排序算法的统称。当待排序的数据无法全部装入计算设备的主内存(通常是RAM),而必须存放在速度较慢的外部存储(通常是硬盘驱动器)中时,就需要使用外部排序。外部排序通常采用“排序-合并”的混合策略:在排序阶段,每次读取一块足以装入主内存的数据,对其排序后写入临时文件;在合并阶段,再将这些已排序的子文件合并为一个更大的文件。
外部排序的一个典型例子是外部归并排序算法:它先对每个能装入RAM的数据块排序,再把排好序的数据块合并在一起。具体做法是,先把文件划分为多个运行(run),使每个运行都小到可以装入主内存;然后在主内存中用归并排序算法对每个运行排序;最后把已排序的运行不断合并成更大的运行,直到整个文件有序。
算法/代码的前提条件:
MergeSort:用于对单个运行进行排序(运行是文件的一部分,该文件足够小以适合主内存)
合并K个排序的数组:用于合并排序的运行。
以下是C++实现中使用的步骤。
输入:
input_file : Name of input file. input.txt
output_file : Name of output file, output.txt
run_size : Size of a run (can fit in RAM)
num_ways : Number of runs to be merged
解决方案:
思路很简单:由于数据量太大,无法一次性把所有元素装入内存进行排序。因此先把数据分成多个块,对每个块使用归并排序进行排序,再把排好序的块写入文件,这样就不必一次处理全部数据。各个块都排好序之后,再用“合并k个已排序数组”的方法把它们合并,得到整体有序的结果。
算法:
- 读取input_file,每次最多读取“run_size”个元素作为一个运行,并对每个运行执行以下操作。
- 使用MergeSort对运行进行排序。
- 将排序后的数组写入一个文件,例如把第 i 个运行写入名为“i”的文件。
- 使用“合并k个已排序数组”的方法,将这些已排序的文件合并起来。
以下是上述步骤的C++实现。
// C++ program to implement
// external sorting using
// merge sort
#include <climits>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <vector>
using namespace std;
// One entry of the k-way merge heap: a value together with the run it
// was read from.
struct MinHeapNode {
// Value read from a run; MinHeap orders its nodes on this field.
int element;
// Index of the sorted run (scratch file) that supplied `element`, so
// the merge step knows which file to read the replacement value from.
int i;
};
// Forward declaration: exchanges the contents of two heap nodes in place.
// Declared here so MinHeap::MinHeapify can call it ahead of its definition.
void swap(MinHeapNode* x, MinHeapNode* y);
// Binary min-heap of MinHeapNode values, ordered by MinHeapNode::element.
// The heap lives in an array handed to the constructor; the class does
// not allocate storage of its own.
class MinHeap {
    // backing array that holds the heap entries
    MinHeapNode* harr;
    // number of entries currently treated as part of the heap
    int heap_size;

public:
    // Adopts a[0..size-1] as storage and arranges it into a min-heap.
    MinHeap(MinHeapNode a[], int size);

    // Sifts the node at the given index down until the subtree rooted
    // there satisfies the min-heap property.
    void MinHeapify(int);

    // Array position of the left child of the node stored at index i.
    int left(int i)
    {
        return i * 2 + 1;
    }

    // Array position of the right child of the node stored at index i.
    int right(int i)
    {
        return i * 2 + 2;
    }

    // Copy of the smallest entry (the root); the heap is not modified.
    MinHeapNode getMin()
    {
        return *harr;
    }

    // Overwrites the root with x, then restores heap order from the top.
    void replaceMin(MinHeapNode x)
    {
        *harr = x;
        MinHeapify(0);
    }
};
// Constructor: adopts the caller's array a[0..size-1] as heap storage
// (only the address is kept, no copy is made) and heapifies it bottom-up.
MinHeap::MinHeap(MinHeapNode a[], int size)
{
    harr = a;
    heap_size = size;
    // Walk from index (heap_size - 1) / 2 back to the root, sifting each
    // node down; every index past that starting point has no children.
    for (int node = (heap_size - 1) / 2; node >= 0; --node)
        MinHeapify(node);
}
// Restores the min-heap property for the subtree rooted at index i,
// assuming both child subtrees are already valid heaps. The node is
// swapped with its smaller child again and again until neither child
// holds a smaller element.
void MinHeap::MinHeapify(int i)
{
    int current = i;
    for (;;) {
        const int lchild = left(current);
        const int rchild = right(current);

        // pick the smallest of the node and its (existing) children
        int best = current;
        if (lchild < heap_size && harr[lchild].element < harr[best].element)
            best = lchild;
        if (rchild < heap_size && harr[rchild].element < harr[best].element)
            best = rchild;

        // node already sits above both children: heap order holds
        if (best == current)
            return;

        swap(&harr[current], &harr[best]);
        current = best;
    }
}
// Exchanges the two heap nodes that x and y point to.
void swap(MinHeapNode* x, MinHeapNode* y)
{
    const MinHeapNode held = *y;
    *y = *x;
    *x = held;
}
// Merges two adjacent, already sorted ranges of arr[] in place:
//   first range  : arr[l..m]
//   second range : arr[m+1..r]
// After the call arr[l..r] is sorted in ascending order. Elements outside
// [l, r] are not touched. When two elements compare equal, the one from
// the first range is written first.
void merge(int arr[], int l, int m, int r)
{
    // Copies of both halves in heap-backed vectors. The variable-length
    // arrays used previously are a compiler extension in C++ and live on
    // the stack, which overflows once a run gets large.
    const std::vector<int> first(arr + l, arr + m + 1);
    const std::vector<int> second(arr + m + 1, arr + r + 1);

    std::size_t i = 0; // next unread element of first
    std::size_t j = 0; // next unread element of second
    int k = l;         // next slot to fill in arr

    // Repeatedly take the smaller head element of the two halves.
    while (i < first.size() && j < second.size()) {
        if (first[i] <= second[j])
            arr[k++] = first[i++];
        else
            arr[k++] = second[j++];
    }

    // At most one of the halves still has elements; append them.
    while (i < first.size())
        arr[k++] = first[i++];
    while (j < second.size())
        arr[k++] = second[j++];
}
// Sorts arr[l..r] (both bounds inclusive) in ascending order with a
// top-down merge sort. A range with fewer than two elements is left as is.
void mergeSort(int arr[], int l, int r)
{
    if (l >= r)
        return;

    // Midpoint written as l + (r - l) / 2 so that the sum l + r, which
    // could overflow for large indices, is never formed.
    const int mid = l + (r - l) / 2;

    // Sort each half, then combine the two sorted halves.
    mergeSort(arr, l, mid);
    mergeSort(arr, mid + 1, r);
    merge(arr, l, mid, r);
}
// Opens fileName with the given fopen() mode string and returns the stream.
//
// fileName : path of the file to open
// mode     : fopen() mode, e.g. "r" or "w"
//
// On failure the reason is reported through perror() and the process
// exits with EXIT_FAILURE, so the returned pointer is never NULL.
// Both parameters are pointers to const so that string literals and
// const strings can be passed; existing callers passing char* still work.
FILE* openFile(const char* fileName, const char* mode)
{
    FILE* fp = fopen(fileName, mode);
    if (fp == NULL) {
        // perror() itself appends ": <reason>" and a newline, so the
        // prefix must not end in one.
        perror("Error while opening the file");
        exit(EXIT_FAILURE);
    }
    return fp;
}
// Merges k sorted files. Names of files are assumed
// to be 1, 2, 3, ... k
void mergeFiles(char* output_file, int n, int k)
{
FILE* in[k];
for (int i = 0; i < k; i++) {
char fileName[2];
// convert i to string
snprintf(fileName, sizeof(fileName),
"%d", i);
// Open output files in read mode.
in[i] = openFile(fileName, "r");
}
// FINAL OUTPUT FILE
FILE* out = openFile(output_file, "w");
// Create a min heap with k heap
// nodes. Every heap node
// has first element of scratch
// output file
MinHeapNode* harr = new MinHeapNode[k];
int i;
for (i = 0; i < k; i++) {
// break if no output file is empty and
// index i will be no. of input files
if (fscanf(in[i], "%d ", &harr[i].element) != 1)
break;
// Index of scratch output file
harr[i].i = i;
}
// Create the heap
MinHeap hp(harr, i);
int count = 0;
// Now one by one get the
// minimum element from min
// heap and replace it with
// next element.
// run till all filled input
// files reach EOF
while (count != i) {
// Get the minimum element
// and store it in output file
MinHeapNode root = hp.getMin();
fprintf(out, "%d ", root.element);
// Find the next element that
// will replace current
// root of heap. The next element
// belongs to same
// input file as the current min element.
if (fscanf(in[root.i], "%d ",
&root.element)
!= 1) {
root.element = INT_MAX;
count++;
}
// Replace root with next
// element of input file
hp.replaceMin(root);
}
// close input and output files
for (int i = 0; i < k; i++)
fclose(in[i]);
fclose(out);
}
// Splits input_file into sorted runs and writes run number r to the
// scratch file named "r" ("0", "1", ...).
//
// input_file : text file of whitespace-separated decimal ints
// run_size   : maximum number of ints read and sorted in memory at once
// num_ways   : number of scratch files to create, i.e. the maximum
//              number of runs that can be stored
//
// All num_ways scratch files are created, even those that stay empty.
// The program exits with EXIT_FAILURE when run_size is not positive or
// when the input needs more than num_ways runs: putting two runs into one
// scratch file would leave that file unsorted, and writing past the last
// scratch file is undefined behaviour.
void createInitialRuns(
    char* input_file, int run_size,
    int num_ways)
{
    if (run_size <= 0) {
        // a non-positive run size could never consume any input
        fprintf(stderr, "createInitialRuns: run_size must be positive\n");
        exit(EXIT_FAILURE);
    }

    // Mode strings kept in writable arrays so they can be passed to
    // openFile() whether it takes char* or const char*.
    char read_mode[] = "r";
    char write_mode[] = "w";

    // For big input file
    FILE* in = openFile(input_file, read_mode);

    // output scratch files
    const int file_count = num_ways > 0 ? num_ways : 0;
    std::vector<FILE*> out(file_count);
    for (int f = 0; f < file_count; f++) {
        // Big enough for any int. A 2-byte buffer would truncate every
        // name from "10" upwards to a single digit.
        char fileName[16];
        snprintf(fileName, sizeof(fileName), "%d", f);
        out[f] = openFile(fileName, write_mode);
    }

    // Buffer for one run; released automatically on return.
    std::vector<int> run(run_size);

    bool more_input = true;
    bool too_many_runs = false;
    int next_output_file = 0;
    while (more_input) {
        // read up to run_size values; the last run may be shorter
        int count = 0;
        while (count < run_size) {
            if (fscanf(in, "%d ", &run[count]) != 1) {
                more_input = false;
                break;
            }
            count++;
        }
        if (count == 0)
            break; // input ended exactly on a run boundary

        if (next_output_file >= file_count) {
            too_many_runs = true;
            break;
        }

        // sort the run in memory and store it in its own scratch file
        mergeSort(run.data(), 0, count - 1);
        for (int j = 0; j < count; j++)
            fprintf(out[next_output_file], "%d ", run[j]);
        next_output_file++;
    }

    // close input and output files
    for (int f = 0; f < file_count; f++)
        fclose(out[f]);
    fclose(in);

    if (too_many_runs) {
        fprintf(stderr,
                "createInitialRuns: input holds more than num_ways * run_size "
                "values; increase num_ways or run_size\n");
        exit(EXIT_FAILURE);
    }
}
// Sorts the integers stored in input_file and writes the result to
// output_file, going through on-disk scratch files.
//   num_ways : number of scratch files handed to both phases
//   run_size : number of values sorted in memory per run
void externalSort(char* input_file, char* output_file, int num_ways, int run_size)
{
    // Phase 1: cut the input into sorted runs, one per scratch file.
    createInitialRuns(input_file, run_size, num_ways);

    // Phase 2: k-way merge of the scratch files into the final output.
    mergeFiles(output_file, run_size, num_ways);
}
// Demo driver: fills "input.txt" with num_ways * run_size pseudo-random
// integers, then sorts that file into "output.txt" with externalSort().
int main()
{
    // number of partitions (runs) the input file is split into
    int num_ways = 10;
    // number of integers in each partition
    int run_size = 1000;

    char input_file[] = "input.txt";
    char output_file[] = "output.txt";

    FILE* in = openFile(input_file, "w");

    // seed from the clock so that each execution produces different data
    srand(time(NULL));

    // write the sample input, one value at a time
    int remaining = num_ways * run_size;
    while (remaining > 0) {
        fprintf(in, "%d ", rand());
        --remaining;
    }
    fclose(in);

    externalSort(input_file, output_file, num_ways, run_size);
    return 0;
}
复杂度分析:
- 时间复杂度: O(n log run_size + n log num_ways)。
对单个运行做归并排序需要O(run_size log run_size)的时间,而总共约有 n / run_size 个运行,因此排序阶段共需O(n log run_size)。合并阶段每输出一个元素,都要在大小为num_ways的最小堆上做一次O(log num_ways)的调整,共需O(n log num_ways)。因此,总体时间复杂度为O(n log run_size + n log num_ways)。
- 辅助空间: O(run_size)。
run_size是存储数组所需的空间。
注意:此代码无法在在线编译器上运行,因为它需要创建文件的权限。在本地计算机上运行时,它会生成一个包含10000个随机数的示例输入文件“input.txt”,对这些数字进行排序,然后把排序后的数字写入“output.txt”文件。此外,它还会生成名称为0、1、2 …的文件,用于存放各个已排序的运行。
参考:
- https://zh.wikipedia.org/wiki/外部排序
- http://web.eecs.utk.edu/~leparker/Courses/CS302-Fall06/Notes/external-sorting2.html