Posts Issued in August, 2014

Week 4 covered:

  • GRAPH SEARCH AND CONNECTIVITY

Programming assignment: The file contains the edges of a directed graph. Vertices are labeled as positive integers from 1 to 875714. Your task is to code up the algorithm from the video lectures for computing strongly connected components (SCCs), and to run this algorithm on the given graph.

The file contains the edges of a directed graph. Vertices are labeled as positive integers from 1 to 875714. Every row indicates an edge: the vertex label in the first column is the tail and the vertex label in the second column is the head (recall the graph is directed, and the edges are directed from the first-column vertex to the second-column vertex). So, for example, the 11th row looks like: "2 47646". This just means that the vertex with label 2 has an outgoing edge to the vertex with label 47646.

Your task is to code up the algorithm from the video lectures for computing strongly connected components (SCCs), and to run this algorithm on the given graph.

Output Format: You should output the sizes of the 5 largest SCCs in the given graph, in decreasing order of sizes, separated by commas (avoid any spaces). So if your algorithm computes the sizes of the five largest SCCs to be 500, 400, 300, 200 and 100, then your answer should be "500,400,300,200,100". If your algorithm finds less than 5 SCCs, then write 0 for the remaining terms. Thus, if your algorithm computes only 3 SCCs whose sizes are 400, 300, and 100, then your answer should be "400,300,100,0,0".

WARNING: This is the most challenging programming assignment of the course. Because of the size of the graph you may have to manage memory carefully. The best way to do this depends on your programming language and environment, and we strongly suggest that you exchange tips for doing this on the discussion forums.

import collections
import sys
import random
 
def return_emptylist():
  """Module-level default factory for defaultdict; yields a fresh empty list."""
  return list()
 
def return_false():
  """Module-level default factory for defaultdict; unseen keys read as False."""
  return bool()
 
def ParseGraph(filename):
  """Parse the SCC-assignment edge list (one "tail head" pair per line).

Args:
- filename: the on-disk graph representation (e.g. algo4SCC.txt)
Returns:
- (adjacency, reverse_adjacency, edges) where adjacency maps a vertex to its
  outgoing edge tuples, reverse_adjacency maps a vertex to the corresponding
  reversed-graph edge tuples, and edges is the full edge list in file order.
"""
  edges = []
  # defaultdict(list) replaces the module-level factory helper.  Appending in
  # place fixes the original's quadratic "lst = lst + [e]" list rebuild, and
  # a single pass over the file replaces the parse-then-index double loop.
  adjacency = collections.defaultdict(list)
  reverse_adjacency = collections.defaultdict(list)
  # 'with' closes the file handle (the original leaked it).
  with open(filename) as graph_file:
    for line in graph_file:
      edge = tuple(int(field) for field in line.split())
      edges.append(edge)
      adjacency[edge[0]].append(edge)
      reverse_adjacency[edge[1]].append((edge[1], edge[0]))
  return adjacency, reverse_adjacency, edges
 
# Shared mutable DFS bookkeeping for Kosaraju's two-pass algorithm:
#   t         -- running finishing-time counter
#   s         -- source vertex of the DFS tree currently being grown
#   finishing -- vertex -> finishing time assigned by DFS
#   leader    -- vertex -> the s value active when the vertex was discovered
#   explored  -- vertex -> visited flag (missing keys default to False)
t = 0
s = 0
finishing = {}
leader = {}
explored = collections.defaultdict(return_false)
def ResetState():
  """Restore every piece of shared DFS state to its initial value."""
  global t, s, finishing, leader, explored
  t, s = 0, 0
  finishing, leader = {}, {}
  explored = collections.defaultdict(return_false)
 
def DFSLoop(edges, labeling, reversed = False):
  """Run DFS from each not-yet-explored vertex, in `labeling` order.

  Sets the module-global s to the root of each new DFS tree before the
  search, so DFS records it as that tree's leader.
  (The `reversed` parameter name shadows the builtin; kept for caller
  compatibility.)
  """
  global s
  for vertex in labeling:
    if explored[vertex]:
      continue
    s = vertex
    DFS(edges, vertex, reversed)
 
forward_adjacency = {}
reverse_adjacency = {}
def DFS(edges, start, reversed = False):
  """Iterative depth-first search from `start`.

  Marks vertices explored, records their leader (module-global s) and
  assigns increasing finishing times via the module-global counter t.

  Args:
  - edges: unused; kept only for signature compatibility with callers
  - start: vertex to begin the search from
  - reversed: if True search reverse_adjacency, else forward_adjacency
  """
  global t
  if reversed:
    adjacency = reverse_adjacency
  else:
    adjacency = forward_adjacency

  # Manual stack of [vertex, next_neighbor_index] frames.  Keeping a resume
  # index per frame fixes the original implementation, which re-scanned the
  # whole adjacency list from index 0 every time control returned to a
  # vertex -- quadratic in a vertex's degree, painful on an 875k-node graph.
  stack = [[start, 0]]
  while stack:
    frame = stack[-1]
    current = frame[0]
    if frame[1] == 0:
      # First time this frame reaches the top of the stack: discovery.
      explored[current] = True
      leader[current] = s
    neighbors = adjacency[current]
    descended = False
    while frame[1] < len(neighbors):
      child = neighbors[frame[1]][1]
      frame[1] += 1
      if not explored[child]:
        stack.append([child, 0])
        descended = True
        break
    if not descended:
      # All outgoing edges handled: assign the next finishing time.
      stack.pop()
      t += 1
      finishing[current] = t
      sys.stderr.write('Finished %s\n' % current)
 
# --- Driver: Kosaraju's two-pass SCC computation on the assignment graph ---
forward_adjacency, reverse_adjacency, edges = ParseGraph("algo4SCC.txt")

sys.stderr.write('Graph parsed\n')

# Vertex labels run 1..num_nodes.  First pass: DFS over the REVERSED graph,
# visiting vertices in decreasing label order, to compute finishing times.
num_nodes = max([e[0] for e in edges] + [e[1] for e in edges])
labeling = xrange(num_nodes, 0, -1)
DFSLoop(edges, labeling, True)

sys.stderr.write('Reverse DFSLoop done\n')

# Second pass: DFS over the forward graph in decreasing order of the
# finishing times produced by the first pass.
inverse_finishing = dict((v, k) for k, v in finishing.iteritems())
finish_labeling = [inverse_finishing[i] for i in xrange(num_nodes, 0, -1)]

ResetState()
DFSLoop(edges, finish_labeling)

sys.stderr.write('Forward DFSLoop done\n')

# Group vertices by leader; each leader's group is one SCC.
sccs = {}
for i in leader:
  if leader[i] not in sccs:
    sccs[leader[i]] = [i]
  else:
    sccs[leader[i]].append(i)

# Print each SCC leader with its size; the five largest sizes form the
# assignment answer.
for i in sccs:
  print '%s\t%s' % (i, len(sccs[i]))

Week 3 covered:

  • PROBABILITY REVIEW
  • LINEAR-TIME SELECTION
  • GRAPHS AND THE CONTRACTION ALGORITHM

Programming assignment: The file contains the adjacency list representation of a simple undirected graph. Your task is to code up and run the randomized contraction algorithm for the min cut problem and use it on the above graph to compute the min cut.

The file contains the adjacency list representation of a simple undirected graph. There are 200 vertices labeled 1 to 200. The first column in the file represents the vertex label, and the particular row (other entries except the first column) tells all the vertices that the vertex is adjacent to. So for example, the 6th row looks like : "6 155 56 52 120 ......". This just means that the vertex with label 6 is adjacent to (i.e., shares an edge with) the vertices with labels 155,56,52,120,......,etc

import sys
import random
# Input file for the min-cut assignment (adjacency-list format).
filename = "algo3_GRAPHmincut.txt"
def ParseGraph(filename):
  """Parse an undirected graph in adjacency-list format per programming Q.3.

Each input row is "vertex neighbor neighbor ...".  Every edge is stored once
as a sorted (low, high) tuple so the two directed mentions collapse to one.

Args:
- filename: the on-disk graph representation
Returns:
(vertices, edges) where
vertices = [vertex_1, vertex_2, ...]
edges = [(vertex_1, vertex_2), ...]
"""
  vertices = []
  edges = set()

  for line in open(filename):
    numbers = [int(token) for token in line.split()]
    head = numbers[0]
    vertices.append(head)
    for neighbor in numbers[1:]:
      edges.add(tuple(sorted((head, neighbor))))

  return vertices, list(edges)
 
def RandomContraction(vertices, edges):
  """One run of Karger's randomized contraction (naive edge-list rebuild).

  Repeatedly picks a uniformly random edge, merges its second endpoint into
  the first, drops the resulting self-loops, and stops at two super-vertices.
  Mutates `vertices` in place; returns (vertices, edges) where len(edges) is
  the size of the cut this run found.
  """
  while len(vertices) > 2:
    contracted = random.choice(edges)
    keep, drop = contracted
    vertices.remove(drop)
    survivors = []
    for candidate in edges:
      # Copies of the contracted edge become self-loops: discard them.
      if candidate == contracted:
        continue
      if drop in candidate:
        # Relabel the merged endpoint, keeping the (low, high) ordering.
        other_end = candidate[1] if candidate[0] == drop else candidate[0]
        candidate = tuple(sorted((keep, other_end)))
      survivors.append(candidate)
    edges = survivors

  return vertices, edges
 
vertices, edges = ParseGraph(filename)

# A single contraction run only finds the minimum cut with small probability,
# so repeat many times and keep the smallest cut ever seen.
minimum = sys.maxint
for i in range(0, 1000):
  # Pass copies: RandomContraction mutates its arguments.
  v, e = RandomContraction(vertices[:], edges[:])
  #print v, len(e)
  if len(e) < minimum:
    minimum = len(e)

print "min cut: %d" % minimum

Your task is to code up and run the randomized contraction algorithm for the min cut problem and use it on the above graph to compute the min cut. (HINT: Note that you'll have to figure out an implementation of edge contractions. Initially, you might want to do this naively, creating a new graph from the old every time there's an edge contraction. But you should also think about more efficient implementations.) (WARNING: As per the video lectures, please make sure to run the algorithm many times with different random seeds, and remember the smallest cut that you ever find.) Write your numeric answer in the space provided. So e.g., if your answer is 5, just type 5 in the space provided.

Week 1 topics:

  • INTRODUCTION
  • ASYMPTOTIC ANALYSIS
  • DIVIDE & CONQUER ALGORITHMS

Programming assignment: The file contains all of the 100,000 integers between 1 and 100,000 (inclusive) in some order, with no integer repeated. Your task is to compute the number of inversions in the file given, where the ith row of the file indicates the ith entry of an array.

Question 1 Download the text file here. (Right click and save link as)

This file contains all of the 100,000 integers between 1 and 100,000 (inclusive) in some order, with no integer repeated.

Your task is to compute the number of inversions in the file given, where the ith row of the file indicates the ith entry of an array. Because of the large size of this array, you should implement the fast divide-and-conquer algorithm covered in the video lectures. The numeric answer for the given input file should be typed in the space below. So if your answer is 1198233847, then just type 1198233847 in the space provided without any space / commas / any other punctuation marks. You can make up to 5 attempts, and we'll use the best one for grading. (We do not require you to submit your code, so feel free to use any programming language you want --- just type the final numeric answer in the following space.)

#! /usr/bin/python
    # coding: UTF-8
 
import sys
 
def main():
    a = []
    tweet_file = open("algo.txt")
    for line in tweet_file:
        a.append(int(line))
    print count_inversions(a, 0, len(a) - 1)
 
def count_inversions(a, p, r):
    """Return the number of inversions in a[p..r] (inclusive).

    Divide and conquer: count inversions in each half, then the split
    inversions while merging.  Sorts a[p..r] in place as a side effect.
    """
    inversions = 0
    if p < r:
        # Floor division: identical to '/' on ints in Python 2, but avoids
        # the float (and infinite recursion) this would produce in Python 3.
        q = (p + r) // 2
        inversions += count_inversions(a, p, q)
        inversions += count_inversions(a, q+1, r)
        inversions += merge_inversions(a, p, q, r)
    return inversions
 
# Merge sentinel: must exceed every possible array entry.  float('inf') is
# safe for any input range (the original 999999 silently miscounts once
# entries reach 999999).
INFINITY = float('inf')
 
def merge_inversions(a, p, q, r):
    """Merge the sorted halves a[p..q] and a[q+1..r] in place and return
    the number of split inversions between them.

    Preconditions: a[p..q] and a[q+1..r] are each sorted.
    """
    n1 = q - p + 1
    # Local sentinel larger than any real entry; removes the dependence on
    # the fragile module-level INFINITY constant.
    sentinel = float('inf')
    L = a[p:q + 1] + [sentinel]
    R = a[q + 1:r + 1] + [sentinel]
    i = 0
    j = 0
    inversions = 0
    # The original's 'counted' flag was dead logic: it was set and then
    # unconditionally cleared within the same iteration, so this is the
    # plain textbook count.
    for k in range(p, r + 1):
        if L[i] <= R[j]:
            a[k] = L[i]
            i += 1
        else:
            # R[j] jumps ahead of all n1 - i elements remaining in L.
            inversions += n1 - i
            a[k] = R[j]
            j += 1
    return inversions
 
# Script entry point: compute and print the inversion count for algo.txt.
if __name__ == '__main__':
    main()

Week 2 covered:

  • THE MASTER METHOD
  • QUICKSORT - ALGORITHM
  • QUICKSORT - ANALYSIS

Programming assignment: The file contains all of the integers between 1 and 10,000 (inclusive, with no repeats) in unsorted order. Your task is to compute the total number of comparisons used to sort the given input file by QuickSort.

The file contains all of the integers between 1 and 10,000 (inclusive, with no repeats) in unsorted order. The integer in the ith row of the file gives you the ith entry of an input array.

Your task is to compute the total number of comparisons used to sort the given input file by QuickSort. As you know, the number of comparisons depends on which elements are chosen as pivots, so we'll ask you to explore three different pivoting rules. You should not count comparisons one-by-one. Rather, when there is a recursive call on a subarray of length m, you should simply add m−1 to your running total of comparisons. (This is because the pivot element is compared to each of the other m−1 elements in the subarray in this recursive call.)

WARNING: The Partition subroutine can be implemented in several different ways, and different implementations can give you differing numbers of comparisons. For this problem, you should implement the Partition subroutine exactly as it is described in the video lectures (otherwise you might get the wrong answer).

DIRECTIONS FOR THIS PROBLEM:

For the first part of the programming assignment, you should always use the first element of the array as the pivot element.

HOW TO GIVE US YOUR ANSWER: Type the numeric answer in the space provided.

import os
 
def quick_sort (arr):
    """Median-of-three quicksort; returns a new sorted list.

    Note: the partition step swaps elements of `arr` in place before
    recursing on copies of the two sides, so the caller's list is
    partially reordered.
    """
    def _swap(seq, a, b):
        seq[a], seq[b] = seq[b], seq[a]
        return seq

    def _median_to_front(seq):
        # Median of the first, middle and last elements becomes the pivot.
        middle = seq[int(ceil(len(seq) / 2.)) - 1]
        pivot_candidate = sorted([seq[0], seq[-1], middle])[1]
        return _swap(seq, 0, seq.index(pivot_candidate))

    if len(arr) < 2:
        return arr
    arr = _median_to_front(arr)
    pivot = arr[0]
    boundary = 1
    for scan in range(1, len(arr)):
        if arr[scan] < pivot:
            arr = _swap(arr, boundary, scan)
            boundary += 1
    boundary -= 1
    arr = _swap(arr, 0, boundary)
    return quick_sort(arr[:boundary]) + [arr[boundary]] + quick_sort(arr[boundary + 1:])
 
from math import ceil
def quick_sort_count (arr, index, median=False):
    """Quicksort that also tallies comparisons per the course rule: a
    recursive call on a subarray of length m contributes m - 1.

    index: 0 uses the first element as pivot; any other value swaps
    arr[index] to the front first (e.g. -1 for last-element pivot).
    median=True applies median-of-three pivoting instead.
    Returns (sorted_list, comparison_count).
    """
    def _swap(seq, a, b):
        seq[a], seq[b] = seq[b], seq[a]
        return seq

    def _median_to_front(seq):
        middle = seq[int(ceil(len(seq) / 2.)) - 1]
        pivot_candidate = sorted([seq[0], seq[-1], middle])[1]
        return _swap(seq, 0, seq.index(pivot_candidate))

    if len(arr) < 2:
        return arr, 0
    if index != 0:
        arr = _swap(arr, index, 0)
    if median:
        arr = _median_to_front(arr)
    pivot = arr[0]
    boundary = 1
    for scan in range(1, len(arr)):
        if arr[scan] < pivot:
            arr = _swap(arr, boundary, scan)
            boundary += 1
    arr = _swap(arr, 0, boundary - 1)
    arr[:boundary - 1], left_count = quick_sort_count(arr[:boundary - 1], index, median)
    arr[boundary:], right_count = quick_sort_count(arr[boundary:], index, median)
    return arr, len(arr) + left_count + right_count - 1
 
def test (l):
    from random import shuffle
    x=range(l)
 
    # Test the quick_sort_count
# a = [0, 9, 8, 7, 6, 5, 4, 3, 2, 1]
# b = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# c = [1, 11, 5, 15, 2, 12, 9, 99, 77, 0]
# d = [999, 3, 2, 98, 765, 8, 14, 15, 16, 88, 145, 100]
# e = [1, 11, 5, 15, 2, 999, 3, 2, 98, 765, 8, 14, 15, 16, 88, 145, 100, 12, 9, 99, 77, 0]
# arr, result = quick_sort_count(x, -1)
# print "The overall comparison # is ", result
#
# arr, result = quick_sort_count(a, -1)
# print "The overall comparison # is ", result
# arr, result = quick_sort_count(b, -1)
# print "The overall comparison # is ", result
# arr, result = quick_sort_count(c, -1)
# print "The overall comparison # is ", result
# arr, result = quick_sort_count(d, -1)
# print "The overall comparison # is ", result
# arr, result = quick_sort_count(e, -1)
# print "The overall comparison # is ", result
 
    # Test the quick_sort
    shuffle(x)
    print "The original data set is ", x
    print "The sorted array is ", quick_sort(x)
 
if __name__ == "__main__":
    test(20)
 
 
def run_quicksort (arr, index, median):
    _,result = quick_sort_count(arr, index, median)
    print result
 
 
def main ():
    arr = []
    #sent_file = open("D:\Python27\algoQUICKSORT.txt",)
    sent_file = open("algoQUICKSORT.txt")
    with sent_file as datafile:
        for row in datafile:
            arr.append(int(row))
    # Because the quick_sort is in-place sorting,
    # arr should be duplicated for different pivot choices
    arr2 = list(arr)
    arr3 = list(arr)
    _,result = quick_sort_count(arr, 0)
    print "Assign the 1st element as pivot:", result
    _,result = quick_sort_count(arr2, -1)
    print "Assign the last element as pivot:", result
    _,result = quick_sort_count(arr3, 0, True)
    print "Assign the 'median' element as pivot:", result
 
 
if __name__=="__main__":
    main()

Coding from Question 1 - 10 covering the whole course.

Final: Question 1

Please download the Enron email dataset enron.zip, unzip it and then restore it using mongorestore. It should restore to a collection called "messages" in a database called "enron". Note that this is an abbreviated version of the full corpus. There should be 120,477 documents after restore.

Inspect a few of the documents to get a basic understanding of the structure. Enron was an American corporation that engaged in a widespread accounting fraud and subsequently failed.

In this dataset, each document is an email message. Like all Email messages, there is one sender but there can be multiple recipients.

Construct a query to calculate the number of messages sent by Andrew Fastow, CFO, to Jeff Skilling, the president. Andrew Fastow's email addess was andrew.fastow@enron.com. Jeff Skilling's email was jeff.skilling@enron.com.

For reference, the number of email messages from Andrew Fastow to John Lavorato (john.lavorato@enron.com) was 1.

db.messages.aggregate({$project:{'headers.From':1, 'headers.To':1}},{$match:{'headers.From':'andrew.fastow@enron.com'}},{$unwind:'$headers.To'},{$match:{'headers.To':'jeff.skilling@enron.com'}},{$group:{_id:{from:'$headers.From', to:'$headers.To'},count:{$sum:1}}})

Please use the Enron dataset you imported for the previous problem. For this question you will use the aggregation framework to figure out pairs of people that tend to communicate a lot. To do this, you will need to unwind the To list for each message.

This problem is a little tricky because a recipient may appear more than once in the To list for a message. You will need to fix that in a stage of the aggregation before doing your grouping and counting of (sender, recipient) pairs.

Which pair of people have the greatest number of messages in the dataset?

// Runs the (sender, recipient) message-count aggregation for each candidate
// answer pair against the enron.messages collection.
var client = require('mongodb').MongoClient;
 
var pairs = [
    {from:'susan.mara@enron.com', to:'jeff.dasovich@enron.com'},
    {from:'susan.mara@enron.com', to:'richard.shapiro@enron.com'},
    {from:'soblander@carrfut.com', to:'soblander@carrfut.com'},
    {from:'susan.mara@enron.com', to:'james.steffes@enron.com'},
    {from:'evelyn.metoyer@enron.com', to:'kate.symes@enron.com'},
    {from:'susan.mara@enron.com', to:'alan.comnes@enron.com'}
];
 
client.connect('mongodb://localhost:27017/enron', function(err, db) {
    if (err) throw err;
 
    var count = pairs.length;   // # of pairs to run aggregate query over
 
    pairs.forEach(function(pair, index, array) {
        var pipeline = [
            {$project:{'headers.From':1, 'headers.To':1}},
            {$match:{'headers.From': pair.from}},
            {$unwind:'$headers.To'},
            {$match:{'headers.To': pair.to}},
            // Group per message and $addToSet so a recipient duplicated in
            // one message's To list is only counted once for that message.
            {$group:{
                _id:{id: '$_id', from:'$headers.From'}, 
                to:{$addToSet: '$headers.To'}
            }},
            {$unwind:'$to'},
            // Re-group across messages to get the per-pair message count.
            {$group:{
                _id:{from:'$_id.from', to:'$to'},
                count: {$sum: 1}
            }},
        ];
 
        db.collection('messages', function(err, collection) {
            if (err) throw err;
 
            collection.aggregate(pipeline, function(err, result) {
                console.dir(result);
 
                // close DB after results from all queries are complete
                count--;
                if (count == 0) db.close()
            });
        });
    });
});

Final: Question 3 In this problem you will update a document in the Enron dataset to illustrate your mastery of updating documents from the shell.

Please add the email address "mrpotatohead@mongodb.com" to the list of addresses in the "headers.To" array for the document with "headers.Message-ID" of "<8147308.1075851042335.JavaMail.evans@thyme>"

After you have completed that task, please download Final3.zip and run final3-validate.js to get the validation code and put it in the box below without any extra spaces. The validation script assumes that it is connecting to a simple mongo instance on the standard port on localhost.

// Final Q3: append one address to headers.To of a specific message.
var MongoClient = require('mongodb').MongoClient;
 
MongoClient.connect('mongodb://localhost:27017/enron', function(err, db) {
    if(err) throw err;
 
    db.collection('messages', function(err, collection) {
        if(err) throw err;
 
        // $push appends to the existing headers.To array of the matched doc.
        collection.update(
            {'headers.Message-ID':'<8147308.1075851042335.JavaMail.evans@thyme>'},
            { $push : {'headers.To': 'mrpotatohead@mongodb.com'}},
        function(err, result){
            if(err) throw err;
            console.log(result);
            return db.close();
        });
 
    });
});

Final: Question 4

Enhancing the Blog to support viewers liking certain comments In this problem, you will be enhancing the blog project to support users liking certain comments and the like counts showing up the in the permalink page.

Start by downloading the code in Final4.zip and loading up the blog dataset posts.json. The user interface has already been implemented for you. It's not fancy. The /post URL shows the like counts next to each comment and displays a Like button that you can click on. That Like button POSTS to the /like URL on the blog, makes the necessary changes to the database state (you are implementing this), and then redirects the browser back to the permalink page.

This full round trip and redisplay of the entire web page is not how you would implement liking in a modern web app, but it makes it easier for us to reason about, so we will go with it.

Your job is to search the code for the string "XXX work here" and make any necessary changes. You can choose whatever schema you want, but you should note that the entry_template makes some assumptions about the how the like value will be encoded and if you go with a different convention than it assumes, you will need to make some adjustments.

The validation script does not look at the database. It looks at the blog.

The validation script, final4-validate.js, will fetch your blog, go to the first post's permalink page and attempt to increment the vote count.

/* The PostsDAO must be constructed with a connected database object */
function PostsDAO(db) {
    "use strict";
 
    /* If this constructor is called without the "new" operator, "this" points
* to the global object. Log a warning and call it correctly. */
    if (false === (this instanceof PostsDAO)) {
        console.log('Warning: PostsDAO constructor called without "new" operator');
        return new PostsDAO(db);
    }
 
    // All operations below act on the blog's "posts" collection.
    var posts = db.collection("posts");
 
    // Insert a new post; the permalink is derived from the title.
    // NOTE(review): permalink collisions are not handled here.
    this.insertEntry = function (title, body, tags, author, callback) {
        "use strict";
        console.log("inserting blog entry" + title + body);
 
        // fix up the permalink to not include whitespace
        var permalink = title.replace( /\s/g, '_' );
        permalink = permalink.replace( /\W/g, '' );
 
        // Build a new post
        var post = {"title": title,
                "author": author,
                "body": body,
                "permalink":permalink,
                "tags": tags,
                "comments": [],
                "date": new Date()}
 
        // now insert the post
        posts.insert(post, function (err, result) {
            "use strict";
 
            if (err) return callback(err, null);
 
            console.log("Inserted new post");
            callback(err, permalink);
        });
    }
 
    // Fetch the most recent `num` posts, newest first.
    this.getPosts = function(num, callback) {
        "use strict";
 
        posts.find().sort('date', -1).limit(num).toArray(function(err, items) {
            "use strict";
 
            if (err) return callback(err, null);
 
            console.log("Found " + items.length + " posts");
 
            callback(err, items);
        });
    }
 
    // Same as getPosts, restricted to posts carrying a given tag.
    this.getPostsByTag = function(tag, num, callback) {
        "use strict";
 
        posts.find({ tags : tag }).sort('date', -1).limit(num).toArray(function(err, items) {
            "use strict";
 
            if (err) return callback(err, null);
 
            console.log("Found " + items.length + " posts");
 
            callback(err, items);
        });
    }
 
    // Fetch one post by permalink, normalizing its comments for the template.
    this.getPostByPermalink = function(permalink, callback) {
        "use strict";
        posts.findOne({'permalink': permalink}, function(err, post) {
            "use strict";
 
            if (err) return callback(err, null);
 
            // XXX: Look here for final exam to see where we store "num_likes"
 
            // fix up likes values. set to zero if data is not present
            if (typeof post.comments === 'undefined') {
                post.comments = [];
            }
 
            // Each comment document in a post should have a "num_likes" entry, so we have to
            // iterate all the comments in the post to make sure that is the case
            for (var i = 0; i < post.comments.length; i++) {
                if (typeof post.comments[i].num_likes === 'undefined') {
                    post.comments[i].num_likes = 0;
                }
                // comment_ordinal is the array index; incrementLikes uses it
                // to address a comment positionally.
                post.comments[i].comment_ordinal = i;
            }
            callback(err, post);
        });
    }
 
    // Append one comment to a post's comments array; email is optional.
    this.addComment = function(permalink, name, email, body, callback) {
        "use strict";
 
        var comment = {'author': name, 'body': body}
 
        if (email != "") {
            comment['email'] = email
        }
 
        posts.update({'permalink': permalink}, {'$push': {'comments': comment}}, function(err, numModified) {
            "use strict";
 
            if (err) return callback(err, null);
 
            callback(err, numModified);
        });
    }
 
    this.incrementLikes = function(permalink, comment_ordinal, callback) {
        "use strict";
 
        // The "comment_ordinal" argument specifies which comment in the post we are looking at
        // Here is an example of how to build a selector with the 'comment_ordinal' variable
        // We have to do it this way because a literal object with variables in field names such as:
        // { 'comments.' + comment_ordinal + '.author' : 'Frank' } is illegal Javascript.
        // var selector_example = {};
        // var comment_ordinal_example = 0;
        // selector_example['comments.' + comment_ordinal_example + '.author'] = 'Frank';
        // Now selector_example = { 'comments.0.author' : 'Frank' } which is a selector for the
        // string 'Frank' in the 'author' field of the first element of the 'comments' array (which
        // is zero indexed).
 
        // TODO (OLD): Final exam question - Increment the number of likes
        // callback(Error("incrementLikes NYI"), null);
 
        // ANSWER
        // $inc by 1 on the positional num_likes field; per MongoDB's $inc
        // semantics a missing field is created with the increment value.
        var selector = {};
        selector['comments.' + comment_ordinal + '.num_likes'] = 1;
        posts.update(
            {'permalink': permalink},
            { '$inc' : selector },
            function(err, post) {
                "use strict";
 
                if (err) return callback(err, null);
                console.dir(post);
                callback(err, post);
        });
    }
}
 
module.exports.PostsDAO = PostsDAO;

Final: Question 5

Suppose your have a collection fubar with the following indexes created:

[
    {
        "v" : 1,
        "key" : {
            "_id" : 1
        },
        "ns" : "test.fubar",
        "name" : "_id_"
    },
    {
        "v" : 1,
        "key" : {
            "a" : 1,
            "b" : 1
        },
        "ns" : "test.fubar",
        "name" : "a_1_b_1"
    },
    {
        "v" : 1,
        "key" : {
            "a" : 1,
            "c" : 1
        },
        "ns" : "test.fubar",
        "name" : "a_1_c_1"
    },
    {
        "v" : 1,
        "key" : {
            "c" : 1
        },
        "ns" : "test.fubar",
        "name" : "c_1"
    },
    {
        "v" : 1,
        "key" : {
            "a" : 1,
            "b" : 1,
            "c" : -1
        },
        "ns" : "test.fubar",
        "name" : "a_1_b_1_c_-1"
    }
]
 
Now suppose you want to run the following query against the collection.
 
db.fubar.find({'a':{'$lt':10000}, 'b':{'$gt': 5000}}, {'a':1, 'c':1}).sort({'c':-1})
 
Which of the following indexes could be used by MongoDB to assist in answering the query. Check all that apply. 
 
a_1_b_1
a_1_c_1
c_1
a_1
a_1_b_1_c_-1

Final: Question 7

You have been tasked to cleanup a photosharing database. The database consists of two collections, albums, and images. Every image is supposed to be in an album, but there are orphan images that appear in no album. Here are some example documents (not from the collections you will be downloading).

> db.albums.findOne()
{
    "_id" : 67
    "images" : [
        4745,
        7651,
        15247,
        17517,
        17853,
        20529,
        22640,
        27299,
        27997,
        32930,
        35591,
        48969,
        52901,
        57320,
        96342,
        99705
    ]
}
 
> db.images.findOne()
{ "_id" : 99705, "height" : 480, "width" : 640, "tags" : [ "dogs", "kittens", "work" ] }

From the above, you can conclude that the image with _id = 99705 is in album 67. It is not an orphan.

Your task is to write a program to remove every image from the images collection that appears in no album. Or put another way, if an image does not appear in at least one album, it's an orphan and should be removed from the images collection.

Download and unzip Final7.zip and use mongoimport to import the collections in albums.json and images.json.

When you are done removing the orphan images from the collection, there should be 89,737 documents in the images collection. To prove you did it correctly, what is the total number of images with the tag 'kittens' after the removal of orphans? As a sanity check, there are 49,932 images that are tagged 'kittens' before you remove the orphans. Hint: you might consider creating an index or two, or your program will take a long time to run.

//# 1 code
// Mongo shell version: remove every image not referenced by any album.
use photoshare
 
// Index the album image arrays so the per-image membership probe is fast.
db.albums.ensureIndex({'images':1});
var cur = db.images.find();
 
// j counts how many orphan images were removed.
var j = 0;
while(cur.hasNext()){
doc = cur.next();
image_id = doc._id
 
// Number of albums referencing this image; zero means it is an orphan.
b = db.albums.find({images : image_id}).count()
if(b == 0){
db.images.remove({_id:image_id})
j++;
}
}
 
//#2 code
// Node-driver version: delete every image document that appears in no
// album.  (A large block of commented-out code copied verbatim from the
// Enron aggregation problem was removed from this script.)
var client = require('mongodb').MongoClient;
 
client.connect('mongodb://localhost:27017/photos', function(err, db) {
    if (err) throw err;
 
    var albums = db.collection('albums');
 
    db.collection('images', function(err, images) {
        if (err) throw err;
 
        // Only _id is needed to probe album membership.
        images.find({}, {'_id':true}, function(err, cursor) {
            if (err) throw err;
 
            // The callback's count is decremented once per image; when it
            // reaches zero every image has been checked and the DB closes.
            var count = cursor.count(function(err, count) {
                console.dir('num images: ' + count);
 
                // iterate over each image
                cursor.each(function(err, item) {
                    if (item !== null) {
                        // attempt to find an album containing the photo, if not prune
                        albums.findOne({images:item._id}, function(err, doc) {
                            if (err) throw err;
 
                            if (doc == null) {
                                images.remove({'_id':item._id}, function(err, numRemoved) {
                                    if (err) throw err;
 
                                    count--;
                                    console.dir('count: ' + count);
                                    if (count == 0) db.close();
                                });
                            } else {
                                count--;
                                console.dir('count: ' + count);
                                if (count == 0) db.close();
                            }
                        });
                    }
                });
            });
        })
    });
 
});
 
//kittens query
// Count images tagged 'kittens' after orphan removal (the final answer).
db.images.find({tags:'kittens'}).count()

Go to page: