tsJensen

A quest for software excellence...

Merge Algorithm for Multiple Sorted IEnumerable<T> Sources

This evening I was asked to write a merge algorithm to efficiently merge multiple iterator sources, yielding a merged iterator that would not require the algorithm to read all of the data into memory should the sources be very large. I’ve never written such an algorithm nor can I recall seeing one, so I didn’t have a very good good answer. Of course that left a simmering thread of though on the back burner of my brain.

After letting it rattle around a bit and without resorting to old fashioned Googling, I sat down and banged out the following code. It was fun to write and works but it took me much too long to write from scratch—about 90 minutes. It may be time to refresh and reload, perhaps by writing a series of posts that implement C# versions of selected algorithms found in a book I recently purchased but have since spent no time reading: Introduction to Algorithms 3rd Edition.

Updated Code (9/6/2014)The original code gets a big performance boost with this refactoring:

public static IEnumerable<T> SortedMerge<T>
  (params IEnumerable<T>[] sortedSources)
  where T : IComparable
{
  if (sortedSources == null || sortedSources.Length == 0)
    throw new ArgumentNullException("sortedSources");

  //1. fetch enumerators for each sourc
  var enums = (from n in sortedSources
         select n.GetEnumerator()).ToArray();

  //2. create index list indicating what MoveNext returned for each enumerator
  var enumHasValue = new List<bool>(enums.Length);
  // MoveNext on all and initialize enumHasValue
  for (int i = 0; i < enums.Length; i++)
  {
    enumHasValue.Add(enums[i].MoveNext());
  }

  // if all false, nothing to iterate over
  if (enumHasValue.All(x => !x)) yield break;

  //3. loop through
  while (true)
  {
    //find index with lowest value
    var lowIdx = -1;
    T lowVal = default(T);
    for (int i = 0; i < enums.Length; i++)
    {
      if (enumHasValue[i])
      {
        // must get first before doing any compares
        if (lowIdx < 0 
		    || null == enums[i].Current //null sorts lowest
		    || enums[i].Current.CompareTo(lowVal) < 0)
        {
          lowIdx = i;
          lowVal = enums[i].Current;
        }
      }
    }

    //if none found, we're done
    if (lowIdx < 0) break;

    //get next value for enumerator chosen
    enumHasValue[lowIdx] = enums[lowIdx].MoveNext();

    //yield up the lowest value
    yield return lowVal;
  }
}

Here’s the original code. I hope you enjoy it. And if you see ways to improve on it, please let me know.

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Merger
{
  class Program
  {
    static void Main(string[] args)
    {
      int[] a = { 1, 3, 6, 102, 105, 230 };
      int[] b = { 101, 103, 112, 155, 231 };

      var mm = new MergeMania();

      foreach(var val in mm.Merge<int>(a, b))
      {
        Console.WriteLine(val);
      }
      Console.ReadLine();
    }
  }

  public class MergeMania
  {
    public IEnumerable<T> Merge<T>(params IEnumerable<T>[] sortedSources) 
      where T : IComparable
    {
      if (sortedSources == null || sortedSources.Length == 0) 
        throw new ArgumentNullException("sortedSources");
      
      //1. fetch enumerators for each sourc
      var enums = (from n in sortedSources 
             select n.GetEnumerator()).ToArray();
      
      //2. fetch enumerators that have at least one value
      var enumsWithValues = (from n in enums 
                   where n.MoveNext() 
                   select n).ToArray();
      if (enumsWithValues.Length == 0) yield break; //nothing to iterate over
       
      //3. sort by current value in List<IEnumerator<T>>
      var enumsByCurrent = (from n in enumsWithValues 
                  orderby n.Current 
                  select n).ToList();
      //4. loop through
      while (true)
      {
        //yield up the lowest value
        yield return enumsByCurrent[0].Current;

        //move the pointer on the enumerator with that lowest value
        if (!enumsByCurrent[0].MoveNext())
        {
          //remove the first item in the list
          enumsByCurrent.RemoveAt(0);

          //check for empty
          if (enumsByCurrent.Count == 0) break; //we're done
        }
        enumsByCurrent = enumsByCurrent.OrderBy(x => x.Current).ToList();
      }
    }
  }
}

And if this answers any questions for you, please do drop me a line to let me know.