MongoDB的MapReduce相当于MySQL中的GROUP BY。
使用MapReduce要实现两个函数:Map Function 和 Reduce Function,在调用mapReduce时需要用到这两个函数。
> db.things.mapReduce(Map Function, Reduce Function, [output | option])
Map Function 调用emit(key, value),遍历collection中所有的记录,将key与value传递给Reduce Function进行处理。
collection things中有如下记录:
> db.things.find()
{ "_id" : 1, "tags" : [ "dog", "cat" ] }
{ "_id" : 2, "tags" : [ "cat" ] }
{ "_id" : 3, "tags" : [ "mouse", "cat", "dog" ] }
{ "_id" : 4, "tags" : [ ] }
Map Function:
> m = function () {
...     this.tags.forEach(function (z) {
...         emit(z, {count:1});
...     });
... }
m函数扫描每条记录的tags,将tags的每个元素(如:"dog"、"cat"……)作为key,{count : 1}作为value,如:{"dog" : {count : 1}}、{"cat" : {count : 1}},将这些{ key : value }(注:经过聚集的)传递给Reduce Function。
> r = function (key, values) {
...     var total = 0;
...     for (var i = 0; i < values.length; i++)
...         total += values[i].count;
...     return {count : total};
... };
r函数统计每个tag的个数,r的返回结果要与emit函数的value格式一致(官方文档说如果不一致的话,bug很难调试)。r函数调用的方式如下:
r( "cat" , [ { count : 1 }, { count : 1 }, { count : 1 } ] );
r( "dog" , [ { count : 1 }, { count : 1 } ] );
r( "mouse" , [ { count : 1 } ] );
执行mapReduce():
> res = db.things.mapReduce(m, r, { out : { replace : 'things_reduce' } });
{
    "result" : "things_reduce",
    "timeMillis" : 4,
    "counts" : {
        "input" : 4,
        "emit" : 6,
        "output" : 3
    },
    "ok" : 1,
}
> db[res.result].find()
{ "_id" : "cat", "value" : { "count" : 3 } }
{ "_id" : "dog", "value" : { "count" : 2 } }
{ "_id" : "mouse", "value" : { "count" : 1 } }
在文档中output选项是可选的,但在操作过程中发现,没有{out : {replace : 'things_reduce'}}会报错。
db.collection.mapReduce(mapfunction,reducefunction[,options]);
输出选项结构如下:
{ "out" : option }
option可以是下面几个选项:
"collection name" – mapReduce的输出结果会替换掉原来的collection,collection不存在则创建
{ replace : "collection name" } – 同上
{ merge : "collection name" } – 将新老数据进行合并,新的替换旧的,没有的添加进去
{ reduce : "collection name" } – 存在老数据时,在原来基础上加新数据(即 new value = old value + mapReduce value)
{ inline : 1 } – 不会创建collection,结果保存在内存里,只限于结果小于16MB的情况
如果用collection name作option,则不能与其它option一起使用,其它则可以,如:
{ "out" : { replace : "collection name", db : "db name" } }
PS: 还有哪些选项,不清楚,没在文档里看到,以后再补。
非 { inline : 1 }的情况,会创建一个名为collection name的collection
> show collections
system.indexes
things
things_reduce
> db.things_reduce.find()
{ "_id" : "cat", "value" : { "count" : 3 } }
{ "_id" : "dog", "value" : { "count" : 2 } }
{ "_id" : "mouse", "value" : { "count" : 1 } }
另一个例子:
> db.foo.find()
{ "_id" : ObjectId("4da54867beb0fbf627f15179"), "username" : "jones", "likes" : 20, "text" : "Hello world!" }
{ "_id" : ObjectId("4da560e0beb0fbf627f1517a"), "username" : "jones", "likes" : 5, "text" : "Hello world aaaaaaaaaa!" }
{ "_id" : ObjectId("4da560fdbeb0fbf627f1517b"), "username" : "chy", "likes" : 15, "text" : "Hello world bbbbbbbbbb!" }
> m
function () {
    emit(this.username, {count:1, likes:this.likes});
}
> f
function (key, values) {
    var result = {count:0, likes:0};
    values.forEach(function (value) {result.count += value.count;result.likes += value.likes;});
    return result;
}
> res = db.foo.mapReduce(m, f, {out: {replace: "test_result" }});
{
    "result" : "test_result",
    "timeMillis" : 4,
    "counts" : {
        "input" : 3,
        "emit" : 3,
        "output" : 2
    },
    "ok" : 1,
}
> db.test_result.find()
{ "_id" : "chy", "value" : { "count" : 1, "likes" : 15 } }
{ "_id" : "jones", "value" : { "count" : 2, "likes" : 25 } }
将{out: {replace: "test_result"}}改为{out: {reduce: "test_result"}}的话,可以看到每运行一次res = db.foo.mapReduce(m, f, {out: {reduce: "test_result"}});结果就会增加,如:
> db.test_result.find()
{ "_id" : "jones", "value" : { "count" : 5, "likes" : 70 } }
{ "_id" : "chy", "value" : { "count" : 2, "likes" : 30 } }