Skip to content

1236. Web Crawler 👎

  • Time: $O(n)$
  • Space: $O(n)$
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
/**
 * // This is the HtmlParser's API interface.
 * // You should not implement it, or speculate about its implementation
 * class HtmlParser {
 *  public:
 *   vector<string> getUrls(string url);
 * };
 */

class Solution {
 public:
  // BFS from startUrl, visiting only URLs on the same hostname.
  // Returns every reachable same-host URL (order unspecified).
  vector<string> crawl(string startUrl, HtmlParser htmlParser) {
    queue<string> q{{startUrl}};
    unordered_set<string> seen{{startUrl}};
    const string hostname = getHostname(startUrl);

    while (!q.empty()) {
      const string currUrl = q.front();
      q.pop();
      for (const string& url : htmlParser.getUrls(currUrl)) {
        if (seen.contains(url))
          continue;
        // Compare exact hostnames. A substring test (url.find(hostname))
        // would wrongly accept e.g. "http://evil.com/leetcode.com".
        if (getHostname(url) == hostname) {
          q.push(url);
          seen.insert(url);
        }
      }
    }

    return {seen.begin(), seen.end()};
  }

 private:
  // Extracts "hostname" from "http://hostname/path..." (no port, per the
  // problem constraints). Handles URLs with no path (no third slash).
  string getHostname(const string& url) {
    // Use size_t: find_first_of returns string::npos, which does not fit
    // in int except by accidental wraparound.
    const size_t firstSlash = url.find_first_of('/');
    const size_t thirdSlash = url.find_first_of('/', firstSlash + 2);
    return thirdSlash == string::npos
               ? url.substr(firstSlash + 2)
               : url.substr(firstSlash + 2, thirdSlash - firstSlash - 2);
  }
};
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
/**
 * // This is the HtmlParser's API interface.
 * // You should not implement it, or speculate about its implementation
 * interface HtmlParser {
 *   public List<String> getUrls(String url) {}
 * }
 */

class Solution {
  public List<String> crawl(String startUrl, HtmlParser htmlParser) {
    Queue<String> q = new ArrayDeque<>(List.of(startUrl));
    Set<String> seen = new HashSet<>(Arrays.asList(startUrl));
    final String hostname = startUrl.split("/")[2];

    while (!q.isEmpty()) {
      final String currUrl = q.poll();
      for (final String url : htmlParser.getUrls(currUrl)) {
        if (seen.contains(url))
          continue;
        if (url.contains(hostname)) {
          q.offer(url);
          seen.add(url);
        }
      }
    }

    return new ArrayList<>(seen);
  }
}
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
# """
# This is HtmlParser's API interface.
# You should not implement it, or speculate about its implementation
# """
# Class HtmlParser(object):
#   def getUrls(self, url: str) -> list[str]:

class Solution:
  def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> list[str]:
    """BFS from startUrl, returning every reachable URL on the same hostname.

    Args:
      startUrl: URL to start crawling from (format "http://hostname/...").
      htmlParser: blackbox API; getUrls(url) returns the links on that page.

    Returns:
      A list (order unspecified) of all reachable same-hostname URLs.
    """
    def hostname(url: str) -> str:
      # "http://hostname/path" -> ['http:', '', 'hostname', ...]; index 2
      # is the hostname (problem guarantees http:// and no port).
      return url.split('/')[2]

    q = collections.deque([startUrl])
    seen = {startUrl}
    target = hostname(startUrl)

    while q:
      currUrl = q.popleft()
      for url in htmlParser.getUrls(currUrl):
        if url in seen:
          continue
        # Compare exact hostnames. A substring test (`hostname in url`)
        # would wrongly accept e.g. "http://evil.com/leetcode.com".
        if hostname(url) == target:
          q.append(url)
          seen.add(url)

    # The signature promises list[str]; returning the set itself violates it.
    return list(seen)