Skip to content

1242. Web Crawler Multithreaded 👍

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
/**
 * // This is the HtmlParser's API interface.
 * // You should not implement it, or speculate about its implementation
 * class HtmlParser {
 *  public:
 *   vector<string> getUrls(string url);
 * };
 */

class Solution {
 public:
  vector<string> crawl(string startUrl, HtmlParser htmlParser) {
    queue<string> q{{startUrl}};
    unordered_set<string> seen{{startUrl}};
    const string& hostname = getHostname(startUrl);
    const int nThreads = std::thread::hardware_concurrency();
    vector<thread> threads;
    std::mutex mtx;
    std::condition_variable cv;

    auto t = [&]() {
      while (true) {
        unique_lock<mutex> lock(mtx);
        cv.wait_for(lock, 30ms, [&]() { return q.size(); });
        if (q.empty())
          return;
        auto cur = q.front();
        q.pop();
        lock.unlock();
        const vector<string> urls = htmlParser.getUrls(cur);
        lock.lock();
        for (const string& url : urls) {
          if (seen.count(url))
            continue;
          if (url.find(hostname) != string::npos) {
            q.push(url);
            seen.insert(url);
          }
        }
        lock.unlock();
        cv.notify_all();
      }
    };

    for (int i = 0; i < nThreads; ++i)
      threads.emplace_back(t);

    for (std::thread& t : threads)
      t.join();

    return {seen.begin(), seen.end()};
  }

 private:
  string getHostname(const string& url) {
    const int firstSlash = url.find_first_of('/');
    const int thirdSlash = url.find_first_of('/', firstSlash + 2);
    return url.substr(firstSlash + 2, thirdSlash - firstSlash - 2);
  }
};