在每个文档中,
records是一个包含许多重复对象的数组。
buy_items中也包含许多重复项。
我怎样才能把重复的东西清理干净?
原始文件:

{
  "_id": "0005d116qwwewdq82a1b84f148fa6027d429f3e",
  "records": [
    {
      "DATE": new Date("1996-02-08T08:00:00+0800"),
      "buy_items": [
        "5210 ",
        "5210 ",
        "5210 "
      ]
    },
    {
      "DATE": new Date("1996-02-08T08:00:00+0800"),
      "buy_items": [
        "5210 ",
        "5210 ",
        "5210 "
      ]
    },
    {
      "DATE": new Date("2012-12-08T08:00:00+0800"),
      "buy_items": [
        "5210 ",
        "1234 ",
        " "
      ]
    }
    ]
}

预期输出:
{
  "_id": "0005d116qwwewdq82a1b84f148fa6027d429f3e",
  "records": [
    {
      "DATE": new Date("1996-02-08T08:00:00+0800"),
      "buy_items": [
        "5210 "
      ]
    },
    {
      "DATE": new Date("2012-12-08T08:00:00+0800"),
      "buy_items": [
        "5210 ",
        "1234 ",
        " "
      ]
    }
    ]
}

使用 Michael 的解决方案,输出可能如下所示:
{
  "_id": "0005d116qwwewdq82a1b84f148fa6027d429f3e",
  "records": [
    "date": new Date("1996-02-08T08:00:00+0800"),
      "buy_items": [
        "5210 ",
        "1234 ",
        " "
      ]
    ]
}

最佳答案

您可以使用aggregation framework

// Rebuild every document with de-duplicated `records` / `buy_items`, then
// write the pipeline output back over the collection.
db.collection.aggregate([
    // One document per element of `records`.
    { "$unwind": "$records" },

    // One document per (record, buy_item) pair.
    { "$unwind": "$records.buy_items" },

    // Merge everything sharing the same (_id, DATE); $addToSet keeps each
    // buy_item value only once per group.
    {
        "$group": {
            "_id": { "id": "$_id", "date": "$records.DATE" },
            "buy_items": { "$addToSet": "$records.buy_items" }
        }
    },

    // Reassemble one document per original _id. Note the rebuilt records use
    // the key "date", not the source's "DATE".
    {
        "$group": {
            "_id": "$_id.id",
            "records": {
                "$push": { "date": "$_id.date", "buy_items": "$buy_items" }
            }
        }
    },

    { "$sort": { "records.0.date": 1 } },

    // Only `_id` and `records` are produced by the stage above, so that is all
    // that ends up in the target collection.
    { "$out": "collection" }
]);

$out运算符允许您将聚合结果写入指定集合或替换现有集合。
更好的做法是使用"Bulk"操作:
// De-duplicate `records` (and each record's `buy_items`) for every document
// and write the rebuilt array back with the Bulk API, flushing every 500
// queued updates. Only the `records` field is touched via $set.
//
// NOTE(review): rebuilt records use the key "date" (lowercase) whereas the
// source documents use "DATE" -- confirm this rename is intended.
var bulk = db.collection.initializeOrderedBulkOp();
var count = 0;

db.collection.aggregate([
    // One document per element of `records`.
    { "$unwind": "$records" },
    { "$project": {
        "date": "$records.DATE",
        // Presumably used to collapse duplicates inside a single record
        // before unwinding -- verify single-argument behaviour on your
        // server version.
        "buy_items": { "$setIntersection": "$records.buy_items" }
    }},
    // One document per (record, buy_item) pair.
    { "$unwind": "$buy_items" },
    // Merge records sharing the same (_id, date); $addToSet keeps each
    // buy_item value once.
    { "$group": {
        "_id": { "id": "$_id", "date": "$date" },
        "buy_items": { "$addToSet": "$buy_items" }
    }},
    // Reassemble one result per original _id.
    { "$group": {
        "_id": "$_id.id",
        "records": { "$push": {
            "date": "$_id.date",
            "buy_items": "$buy_items"
        }}
    }}
]).forEach(function(doc) {
    bulk.find({ "_id": doc._id }).updateOne({
        "$set": { "records": doc.records }
    });
    count++;

    // Flush a full batch and start a fresh builder for the next one.
    if (count % 500 === 0) {
        bulk.execute();
        bulk = db.collection.initializeOrderedBulkOp();
    }
});

// Flush the final partial batch. Skipped when count is 0 or an exact multiple
// of 500, because the current builder then has no queued operations.
if (count % 500 !== 0) {
    bulk.execute();
}

结果:
{
    "_id" : "0005d116qwwewdq82a1b84f148fa6027d429f3e",
    "records" : [
            {
                    "date" : ISODate("2012-12-08T00:00:00Z"),
                    "buy_items" : [
                            " ",
                            "1234 ",
                            "5210 "
                    ]
            },
            {
                    "date" : ISODate("1996-02-08T00:00:00Z"),
                    "buy_items" : [
                            "5210 "
                    ]
            }
    ]
}

10-05 17:54
查看更多