I have a large dataset with documents that sometimes cross-reference each other and sometimes do not. Before I can map-reduce based on those cross-references, I have to set the array of cross-references to be the same for every value in the cross-reference.
I use this function in the shell to consolidate those arrays:
/**
 * Consolidates the cross-reference arrays stored in `db.catalog`.
 *
 * For every document that has a non-empty `xref` array, the function collects
 * all names reachable by following `xref` arrays (starting from the names in
 * that array) and writes the resulting list to the `xref` field of every
 * document whose `name` is in the list.
 *
 * Differences from a single-pass merge:
 *  - references are followed until no new name turns up, so a document that
 *    is pulled into a group indirectly contributes its own references instead
 *    of having them overwritten;
 *  - documents without an `xref` array (or with an empty one) are skipped;
 *  - a group that was already written during this run is not recomputed;
 *  - each group is written with one multi-document update.
 *
 * Only documents whose `name` appears in the consolidated list are updated.
 * Prints a progress line every 1000 scanned documents.
 *
 * Relies on the shell globals `db` and `print`. Takes no arguments and
 * returns nothing.
 */
function fixArray2() {
    var counter = 0;
    // Maps a name key to the number of the group it was last written with.
    var groupOf = {};
    var groupCount = 0;

    // Lookup key that keeps values of different types apart ("string:1" vs
    // "number:1"). NOTE(review): assumes names are primitives (typically
    // strings); other value types are keyed by their string form - confirm.
    function keyOf(value) {
        return typeof value + ":" + value;
    }

    // Appends every not-yet-seen name to both `group` and `frontier`.
    function addUnseen(names, seen, group, frontier) {
        for (var i = 0; i < names.length; i++) {
            var key = keyOf(names[i]);
            if (seen[key] !== true) {
                seen[key] = true;
                group.push(names[i]);
                frontier.push(names[i]);
            }
        }
    }

    // Returns the de-duplicated list of names reachable from `seedNames`.
    function collectGroup(seedNames) {
        var group = [];
        var seen = {};
        var frontier = [];
        addUnseen(seedNames, seen, group, frontier);
        // Each round only looks up the names discovered in the previous one.
        while (frontier.length > 0) {
            var discovered = [];
            var matches = db.catalog.find({name: {$in: frontier}}, {xref: true, _id: false});
            while (matches.hasNext()) {
                var refs = matches.next().xref;
                if (Array.isArray(refs)) {
                    addUnseen(refs, seen, group, discovered);
                }
            }
            frontier = discovered;
        }
        return group;
    }

    // True when every name was already written as part of one and the same
    // group; an array spanning two groups still has to be merged.
    function alreadyConsolidated(names) {
        var first = groupOf[keyOf(names[0])];
        if (first === undefined) {
            return false;
        }
        for (var i = 1; i < names.length; i++) {
            if (groupOf[keyOf(names[i])] !== first) {
                return false;
            }
        }
        return true;
    }

    // Only the xref field is needed from the outer scan.
    var cursor = db.catalog.find({}, {xref: true, _id: false});
    while (cursor.hasNext()) {
        var xref1 = cursor.next().xref;
        if (Array.isArray(xref1) && xref1.length > 0 && !alreadyConsolidated(xref1)) {
            var group = collectGroup(xref1);
            // upsert = false, multi = true: touch every existing match only.
            db.catalog.update({name: {$in: group}}, {$set: {xref: group}}, false, true);
            groupCount++;
            for (var i = 0; i < group.length; i++) {
                groupOf[keyOf(group[i])] = groupCount;
            }
        }
        counter++;
        if (counter % 1000 === 0) {
            print("Processed " + counter + " documents.");
        }
    }
}
It works, but I have to run it fairly often. Can anyone suggest improvements?
If you do the work up front, when writing the documents to the collection, you may be able to avoid doing this map-reduce where you do the work at a later time.
Therefore, get the list of documents that should be cross-referenced and write it with the document upon insertion. Update it as needed: for example, when a document is removed or no longer references another.