问题描述
我是 mongo 聚合的新手,我需要帮助来创建聚合,
I'm new with mongo aggregations, and I need a help with creating one,
我以以下文档的集合为例:
I have a collection of the following document as an example:
{
"_id" : ObjectId("5afc2f06e1da131c9802071e"),
"_class" : "Traveler",
"name" : "John Due",
"startTimestamp" : 1526476550933,
"endTimestamp" : 1526476554823,
"source" : "istanbul",
"cities" : [
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
"name" : "Moscow",
"timestamp" : 1526476550940,
"timeSpent" : 3180
},
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
"name" : "Cairo",
"timestamp" : 1625476550940,
"timeSpent" : 318000,
},
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
"name" : "Moscow",
"timestamp" : 15211276550940,
"timeSpent" : 318011
}
],
"variables" : [
{
"_id" : "cd4318a83c9b-a8478d76bfd3e4b6-5967",
"name" : "Customer Profile",
"lastValue" : "",
"values" : [],
"additionalData" : {}
},
{
"_id" : "366cb8c07996-c62c37a87a86d526-d3e7",
"name" : "Target Telephony Queue",
"lastValue" : "",
"values" : [],
"additionalData" : {}
},
{
"_id" : "4ed84742da33-d70ba8a809b712f3-bdf4",
"name" : "IMEI",
"lastValue" : "",
"values" : [],
"additionalData" : {}
},
{
"_id" : "c8103687c1c8-97d749e349d785c8-9154",
"name" : "Budget",
"defaultValue" : "",
"lastValue" : "",
"values" : [
{
"value" : "3000",
"timestamp" : NumberLong(1526476550940),
"element" : "c8103687c1c8-97d749e349d785c8-9154"
}
],
"additionalData" : {}
}
]
}
我需要一个结果文档,显示集合中每个旅行者访问每个城市的次数,以及平均预算(预算是变量数组中的一个元素
I need to have a resulting document showing the how many times each city have been visited by each traveler in the collection, and the average budget (budget is an element in the variables array
因此生成的文档将类似于:
so the resulting document will be similar to:
{
"_id" : ObjectId("5afc2f06e1da131c9802071e"),
"_class" : "Traveler",
"name" : "John Due",
"startTimestamp" : 1526476550933,
"endTimestamp" : 1526476554823,
"source" : "istanbul",
"cities" : [
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
"name" : "Moscow",
"visited":2
},
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
"name" : "Cairo",
"visited":1
}
],
"variables" : [
{
"_id" : "c8103687c1c8-97d749e349d785c8-9154",
"name" : "Budget",
"defaultValue" : "",
"lastValue" : "",
"values" : [
{
"value" : "3000",
}
],
}
],
}
感谢您的帮助
推荐答案
作为快速说明,您需要更改 "values"
中的 "value"
字段code> 是数字,因为它目前是一个字符串.但继续回答:
As a quick note, you need to change your "value"
field inside the "values"
to be numeric, since it's presently a string. But on to the answer:
如果您有权访问 $reduce
来自 MongoDB 3.4,那么你实际上可以做这样的事情:
If you have access to $reduce
from MongoDB 3.4, then you can actually do something like this:
db.collection.aggregate([
{ "$addFields": {
"cities": {
"$reduce": {
"input": "$cities",
"initialValue": [],
"in": {
"$cond": {
"if": { "$ne": [{ "$indexOfArray": ["$$value._id", "$$this._id"] }, -1] },
"then": {
"$concatArrays": [
{ "$filter": {
"input": "$$value",
"as": "v",
"cond": { "$ne": [ "$$this._id", "$$v._id" ] }
}},
[{
"_id": "$$this._id",
"name": "$$this.name",
"visited": {
"$add": [
{ "$arrayElemAt": [
"$$value.visited",
{ "$indexOfArray": [ "$$value._id", "$$this._id" ] }
]},
1
]
}
}]
]
},
"else": {
"$concatArrays": [
"$$value",
[{
"_id": "$$this._id",
"name": "$$this.name",
"visited": 1
}]
]
}
}
}
}
},
"variables": {
"$map": {
"input": {
"$filter": {
"input": "$variables",
"cond": { "$eq": ["$$this.name", "Budget"] }
}
},
"in": {
"_id": "$$this._id",
"name": "$$this.name",
"defaultValue": "$$this.defaultValue",
"lastValue": "$$this.lastValue",
"value": { "$avg": "$$this.values.value" }
}
}
}
}}
])
如果你有 MongoDB 3.6,你可以使用 $mergeObjects
:
If you have MongoDB 3.6, you can clean that up a bit with $mergeObjects
:
db.collection.aggregate([
{ "$addFields": {
"cities": {
"$reduce": {
"input": "$cities",
"initialValue": [],
"in": {
"$cond": {
"if": { "$ne": [{ "$indexOfArray": ["$$value._id", "$$this._id"] }, -1] },
"then": {
"$concatArrays": [
{ "$filter": {
"input": "$$value",
"as": "v",
"cond": { "$ne": [ "$$this._id", "$$v._id" ] }
}},
[{
"_id": "$$this._id",
"name": "$$this.name",
"visited": {
"$add": [
{ "$arrayElemAt": [
"$$value.visited",
{ "$indexOfArray": [ "$$value._id", "$$this._id" ] }
]},
1
]
}
}]
]
},
"else": {
"$concatArrays": [
"$$value",
[{
"_id": "$$this._id",
"name": "$$this.name",
"visited": 1
}]
]
}
}
}
}
},
"variables": {
"$map": {
"input": {
"$filter": {
"input": "$variables",
"cond": { "$eq": ["$$this.name", "Budget"] }
}
},
"in": {
"$mergeObjects": [
"$$this",
{ "values": { "$avg": "$$this.values.value" } }
]
}
}
}
}}
])
但是除了我们保留 additionalData
回到之前一点,然后你总是可以$unwind
要累积的城市"
:
Going back a little before that, then you can always $unwind
the "cities"
to accumulate:
db.collection.aggregate([
{ "$unwind": "$cities" },
{ "$group": {
"_id": {
"_id": "$_id",
"cities": {
"_id": "$cities._id",
"name": "$cities.name"
}
},
"_class": { "$first": "$class" },
"name": { "$first": "$name" },
"startTimestamp": { "$first": "$startTimestamp" },
"endTimestamp" : { "$first": "$endTimestamp" },
"source" : { "$first": "$source" },
"variables": { "$first": "$variables" },
"visited": { "$sum": 1 }
}},
{ "$group": {
"_id": "$_id._id",
"_class": { "$first": "$class" },
"name": { "$first": "$name" },
"startTimestamp": { "$first": "$startTimestamp" },
"endTimestamp" : { "$first": "$endTimestamp" },
"source" : { "$first": "$source" },
"cities": {
"$push": {
"_id": "$_id.cities._id",
"name": "$_id.cities.name",
"visited": "$visited"
}
},
"variables": { "$first": "$variables" },
}},
{ "$addFields": {
"variables": {
"$map": {
"input": {
"$filter": {
"input": "$variables",
"cond": { "$eq": ["$$this.name", "Budget"] }
}
},
"in": {
"_id": "$$this._id",
"name": "$$this.name",
"defaultValue": "$$this.defaultValue",
"lastValue": "$$this.lastValue",
"value": { "$avg": "$$this.values.value" }
}
}
}
}}
])
所有返回(几乎)相同的东西:
All return (almost) the same thing:
{
"_id" : ObjectId("5afc2f06e1da131c9802071e"),
"_class" : "Traveler",
"name" : "John Due",
"startTimestamp" : 1526476550933,
"endTimestamp" : 1526476554823,
"source" : "istanbul",
"cities" : [
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-1122",
"name" : "Cairo",
"visited" : 1
},
{
"_id" : "ef8f6b26328f-0663202f94faeaeb-3981",
"name" : "Moscow",
"visited" : 2
}
],
"variables" : [
{
"_id" : "c8103687c1c8-97d749e349d785c8-9154",
"name" : "Budget",
"defaultValue" : "",
"lastValue" : "",
"value" : 3000
}
]
}
前两种形式当然是最理想的做法,因为它们只是在内部"工作.始终使用相同的文档.
The first two forms are of course the most optimal thing to do since they are simply working "within" the same document at all times.
像 $reduce
允许累积"数组上的表达式,所以我们可以在这里使用它来保持减少"我们使用 "_id" 值的数组noreferrer">$indexOfArray
以查看是否已经存在匹配的累积项.-1
的结果意味着它不存在.
Operators like $reduce
allow "accumulation" expressions on arrays, so we can use it here to keep a "reduced" array which we test for the unique "_id"
value using $indexOfArray
in order to see if the there already is an accumulated item that matches. A result of -1
means it's not there.
为了构造一个简化数组"我们将 []
的 initialValue"
作为一个空数组,然后通过 $concatArrays
.所有的过程都是通过三元"来决定的.$cond
运算符,它考虑了if"
条件和 then"
$filter
的输出在当前 $$value
上排除当前索引 _id
条目,当然还有另一个数组"表示单个对象.
In order construct a "reduced array" we take the "initialValue"
of []
as an empty array and then add to it via $concatArrays
. All of that process is decided via the "ternary" $cond
operator which considers the "if"
condition and "then"
either "joins" the output of the $filter
on the current $$value
to exclude the current index _id
entry, with of course another "array" representing the singular object.
对于那个对象"我们再次使用 $indexOfArray
实际获得匹配的索引,因为我们知道该项目在那里",并使用它通过 $arrayElemAt
和 $add
到它以递增.
For that "object" we again use the $indexOfArray
to actually get the matched index since we know that the item "is there", and use that to extract the current "visited"
value from that entry via $arrayElemAt
and $add
to it in order to increment.
在 else"
的情况下,我们只需添加一个数组"即可.作为对象"它只有一个默认的 visited"
值 1
.使用这两种情况可以有效地累积数组中的唯一值以输出.
In the "else"
case we simply add an "array" as an "object" which just has a default "visited"
value of 1
. Using both those cases effectively accumulates unique values within the array to output.
在后一个版本中,我们只是$unwind代码>
数组并使用连续的$group
阶段以首先计数"在唯一的内部条目上,然后重新构造数组"变成类似的形式.
In the latter version, we just $unwind
the array and use successive $group
stages in order to first "count" on the unique inner entries, and then "re-construct the array" into the similar form.
使用$unwind
看起来简单得多,但由于它实际上是为每个数组条目获取文档的副本,因此这实际上增加了处理的相当大的开销.在现代版本中,通常有数组运算符,这意味着您不需要使用它,除非您的意图是跨文档累积".所以如果你真的需要 $group
来自内部"的键值一个数组,那么这就是你真正需要使用它的地方.
Using $unwind
looks far more simple, but since what it actually does is take a copy of the document for every array entry, then this actually adds considerable overhead to processing. In modern versions there are generally array operators which mean you don't need to use this unless your intention is to "accumulate across documents". So if you actually need to $group
on a value of a key from "inside" an array, then that is where you actually do need to use it.
至于 变量"
那么我们可以简单地使用 $filter
再次在此处获取匹配的 Budget"
条目.我们将此作为 $map
的输入 允许重新整形"的运算符;数组内容.我们主要希望这样您就可以获取 "values"
的内容(一旦您将其全部设为数字)并使用 $avg
运算符,它提供了字段路径符号"直接形成数组值,因为它实际上可以从这样的输入返回结果.
As for the "variables"
then we can simply use the $filter
again here to get the matching "Budget"
entry. We do this as the input to the $map
operator which allows "re-shaping" of the array content. We mainly want that so you can take the content of the "values"
( once you make it all numeric ) and use the $avg
operator, which is supplied that "field path notation" form directly to the array values because it can in fact return a result from such an input.
这通常会让您浏览几乎所有主要的数组运算符".聚合管道(不包括设置"运算符)都在单个管道阶段中.
That generally makes the tour of pretty much ALL of the main "array operators" for the aggregation pipeline ( excluding the "set" operators ) all within a single pipeline stage.
也不要忘记你总是想要$match 与常规 Query运营商作为第一阶段"任何聚合管道,以便只选择您需要的文档.最好使用索引.
Also don't ever forget that you just about always want to $match
with regular Query Operators as the "very first stage" of any aggregation pipeline in order to just select the documents you need. Ideally using an index.
替代人员正在处理客户端代码中的文档.通常不推荐这样做,因为上述所有方法都表明它们实际上减少"了从服务器返回的内容,通常是服务器聚合"的点.
Alternates are working through the documents in client code. It generally would not be recommended since all methods above show they actually "reduce" the content as returned from the server, as is generally the point of "server aggregations".
它可能"由于基于文档",因此成为可能自然,使用 $unwind
和客户端处理可能需要更多的时间,更大的结果集可能是一个选项,但我认为它更有可能
It "may" be possible due to the "document based" nature that larger result sets may take considerably more time using $unwind
and client processing could be an option, but I would consider it far more likely
下面是一个清单,演示了如何在返回结果时对游标流应用变换,做同样的事情.有三个演示版本的转换,显示完全"与上面相同的逻辑,使用 lodash
方法进行累加的实现,以及自然"的Map
实现上的累积:
Below is a listing that demonstrates applying a transform to the cursor stream as results are returned doing the same thing. There are three demonstrated versions of the transform, showing "exactly" the same logic as above, a implementation with lodash
methods for accumulation, and a "natural" accumulation on the Map
implementation:
const { MongoClient } = require('mongodb');
const { chain } = require('lodash');
const uri = 'mongodb://localhost:27017';
const opts = { useNewUrlParser: true };
const log = data => console.log(JSON.stringify(data, undefined, 2));
const transform = ({ cities, variables, ...d }) => ({
...d,
cities: cities.reduce((o,{ _id, name }) =>
(o.map(i => i._id).indexOf(_id) != -1)
? [
...o.filter(i => i._id != _id),
{ _id, name, visited: o.find(e => e._id === _id).visited + 1 }
]
: [ ...o, { _id, name, visited: 1 } ]
, []).sort((a,b) => b.visited - a.visited),
variables: variables.filter(v => v.name === "Budget")
.map(({ values, additionalData, ...v }) => ({
...v,
values: (values != undefined)
? values.reduce((o,e) => o + e.value, 0) / values.length
: 0
}))
});
const alternate = ({ cities, variables, ...d }) => ({
...d,
cities: chain(cities)
.groupBy("_id")
.toPairs()
.map(([k,v]) =>
({
...v.reduce((o,{ _id, name }) => ({ ...o, _id, name }),{}),
visited: v.length
})
)
.sort((a,b) => b.visited - a.visited)
.value(),
variables: variables.filter(v => v.name === "Budget")
.map(({ values, additionalData, ...v }) => ({
...v,
values: (values != undefined)
? values.reduce((o,e) => o + e.value, 0) / values.length
: 0
}))
});
const natural = ({ cities, variables, ...d }) => ({
...d,
cities: [
...cities
.reduce((o,{ _id, name }) => o.set(_id,
[ ...(o.has(_id) ? o.get(_id) : []), { _id, name } ]), new Map())
.entries()
]
.map(([k,v]) =>
({
...v.reduce((o,{ _id, name }) => ({ ...o, _id, name }),{}),
visited: v.length
})
)
.sort((a,b) => b.visited - a.visited),
variables: variables.filter(v => v.name === "Budget")
.map(({ values, additionalData, ...v }) => ({
...v,
values: (values != undefined)
? values.reduce((o,e) => o + e.value, 0) / values.length
: 0
}))
});
(async function() {
try {
const client = await MongoClient.connect(uri, opts);
let db = client.db('test');
let coll = db.collection('junk');
let cursor = coll.find().map(natural);
while (await cursor.hasNext()) {
let doc = await cursor.next();
log(doc);
}
client.close();
} catch(e) {
console.error(e)
} finally {
process.exit()
}
})()
这篇关于聚合累积内部对象的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!