Last Updated: February 25, 2016
·
3.543K
· themichael'tips

MongoDB Map-Reduce: choice of key

Introduction

The keys used for the Map-Reduce pattern can be one of these three types:

  • single value: integer, float, string...
  • object: Date Object, NumberLong Object...
  • set of data (document): { indexa : 123, indexb : new Date(2015,0,1) }

This tutorial shows three examples, one for each type, that solve the following problem.

The problem

Let's consider the collection j4 below:

>db.j4.find()
{ "_id" : 1, "value" : { "timeline" : ISODate("2015-07-01T01:23:03Z") } }
{ "_id" : 2, "value" : { "timeline" : ISODate("2015-07-01T05:00:00Z") } }
{ "_id" : 3, "value" : { "timeline" : ISODate("2015-07-03T13:02:14Z") } }
{ "_id" : 4, "value" : { "timeline" : ISODate("2015-07-03T20:10:06Z") } }
{ "_id" : 5, "value" : { "timeline" : ISODate("2015-07-03T21:03:07Z") } }
{ "_id" : 6, "value" : {  } }

Here is the problem: count the documents of j4 grouped by day (yyyy-mm-dd).

The output should look like this:

date : 2015-07-01, total : 2
date : 2015-07-03, total : 3

Solution 1: key as date string

A date can be treated as a string if there is no need to run complex queries over it.
This solution transforms a date object into a zero-padded string and uses that string as the key.

/**
 * Map function for mapReduce: emits one {"tot": 1} per document, keyed by the
 * UTC calendar day of `this.value.timeline` formatted as "yyyy-mm-dd".
 *
 * Called with `this` bound to the current document; uses the `emit(key, value)`
 * function provided by the map-reduce runtime. Documents without a
 * `value.timeline` are skipped and emit nothing.
 */
var map_use_pad = function() {

    // Skip documents that have no "value" sub-document or no timeline at all.
    if (this.value == null || this.value.timeline == null) return;

    // Left-pads n with the character z (defaults to '0') up to the given width.
    // for this pad function, thanks to https://stackoverflow.com/users/182668/pointy
    var pad = function pad(n, width, z) {
      z = z || '0';
      n = n + '';
      return n.length >= width ? n : new Array(width - n.length + 1).join(z) + n;
    };

    var d = this.value.timeline;

    // UTC getters keep the day bucket independent of the time zone of the
    // machine running this function; getUTCMonth() is 0-based, hence the + 1.
    var key = d.getUTCFullYear() + "-" + pad(d.getUTCMonth() + 1, 2, 0) + "-" + pad(d.getUTCDate(), 2, 0);

    emit(key, {"tot":1});
};

/**
 * Reduce function for the string-keyed map-reduce: sums the "tot" field of
 * every value received for a key.
 *
 * @param {string} key - the "yyyy-mm-dd" key emitted by the map step (unused in the sum).
 * @param {Array} values - objects of the form {"tot": <number>}.
 * @returns {Object} a single {"tot": <sum>} object, same shape as the inputs.
 */
var reduce_with_pad = function (key, values) {

    var sum = values.reduce(function (acc, item) {
        return acc + item.tot;
    }, 0);

    /* if date as object is needed, return this instead */
    // return {"date_as_object" : new Date(key + "T00:00:00.000Z"), "tot" : sum};

    return {"tot" : sum};
};

Now it is possible to run the map-reduce functions over the j4 collection as follows:

>db.j4.mapReduce(map_use_pad, reduce_with_pad, {out : {reduce: 'out_pad' }});
{
    "result" : "out_pad",
    "timeMillis" : 81,
    "counts" : {
        "input" : 6,
        "emit" : 5,
        "reduce" : 2,
        "output" : 2
    },
    "ok" : 1
}

Let's take a look at the result:

> db.out_pad.find()
{ "_id" : "2015-07-01", "value" : { "tot" : 2 } }
{ "_id" : "2015-07-03", "value" : { "tot" : 3 } }

Solution 2: key as date object

Once the problem is solved and the output collection has been generated, it may be necessary to run queries over the date (_id), like the following (pseudo-code):

db.out_pad.find({_id > "2015-01-01" AND _id < "2015-07-02"})       W R O N G!!!

As long as _id is a string, only complex regular expressions can be used to implement this pseudo-code.

This second solution solves the issue: the key (_id) is a date object.

/**
 * Map function for mapReduce: emits one {"tot": 1} per document, keyed by a
 * Date object set to 00:00:00 UTC of the day of `this.value.timeline`.
 *
 * Called with `this` bound to the current document; uses the `emit(key, value)`
 * function provided by the map-reduce runtime. Documents without a
 * `value.timeline` are skipped and emit nothing.
 */
var map_use_object = function() {

    // Skip documents that have no "value" sub-document or no timeline at all.
    if (this.value == null || this.value.timeline == null) return;

    var d = this.value.timeline;

    // Date.UTC interprets its arguments as UTC, so they must come from the UTC
    // getters; feeding it local-time components would shift the day whenever
    // the machine running this function is not in UTC.
    var key = new Date( Date.UTC(d.getUTCFullYear(), d.getUTCMonth(), d.getUTCDate()) );

    emit(key, {"tot":1});
};

/**
 * Reduce function for the Date-keyed map-reduce: sums the "tot" field of
 * every value received for a key.
 *
 * @param {Date} key - the day key emitted by the map step (not used here).
 * @param {Array} values - objects of the form {"tot": <number>}.
 * @returns {Object} a single {"tot": <sum>} object, same shape as the inputs.
 */
var reduce_with_object = function (key, values) {

    // Fold the partial counts into one running total, starting from zero.
    var addTot = function (runningTotal, partial) {
        return runningTotal + partial.tot;
    };

    return {"tot" : values.reduce(addTot, 0)};
};

Once the result has been generated using...

db.j4.mapReduce(map_use_object, reduce_with_object, {out : {reduce: 'out_object' }});

...it is possible to perform the expected query:

>db.out_object.find
(
    { 
        _id:{
                $gt: new Date("2015-01-01T00:00:00Z"), 
                $lt: new Date("2015-07-02T00:00:00Z")
            }
    }
)
{ "_id" : ISODate("2015-07-01T00:00:00Z"), "value" : { "tot" : 2 } }

Solution 3: key as document

The easiest way to split a date into a group of key fields is something like this:

key = { year: yyyy, month: mm, day: dd }

This key allows certain kinds of queries to be built easily.

/**
 * Map function for mapReduce: emits one {"tot": 1} per document, keyed by a
 * { year, month, day } document taken from the UTC calendar day of
 * `this.value.timeline` (month is 1-based).
 *
 * Called with `this` bound to the current document; uses the `emit(key, value)`
 * function provided by the map-reduce runtime. Documents without a
 * `value.timeline` are skipped and emit nothing.
 */
var map_use_set = function() {

    // Skip documents that have no "value" sub-document or no timeline at all.
    if (this.value == null || this.value.timeline == null) return;

    var d = this.value.timeline;

    // UTC getters keep the day bucket independent of the time zone of the
    // machine running this function; getUTCMonth() is 0-based, hence the + 1.
    var key = { year:d.getUTCFullYear(), month:d.getUTCMonth()+1, day:d.getUTCDate() };

    emit(key, {"tot":1});
};

/**
 * Reduce function for the document-keyed map-reduce: sums the "tot" field of
 * every value received for a key.
 *
 * @param {Object} key - the { year, month, day } key emitted by the map step (not used here).
 * @param {Array} values - objects of the form {"tot": <number>}.
 * @returns {Object} a single {"tot": <sum>} object, same shape as the inputs.
 */
var reduce_with_set = function (key, values) {

    var grandTotal = values.reduce(function (soFar, entry) {
        return soFar + entry.tot;
    }, 0);

    var result = {};
    result["tot"] = grandTotal;
    return result;
};

Once the result has been generated using...

db.j4.mapReduce(map_use_set, reduce_with_set, {out : {reduce: 'out_set' }});

...it is possible to get the total for the first day of every month of 2015 with a simple query:

> db.out_set.find({"_id.year":2015, "_id.day" : 1})
{ "_id" : { "year" : 2015, "month" : 7, "day" : 1 }, "value" : { "tot" : 2 } }

Conclusion

The choice of map-reduce key has an important consequence for the kinds of queries that are available once the data has been generated. It is the nature of the queries that dictates the kind of key to use.