This week is based on Lessons 3 and 4 of the Udacity course CS101, "Introduction to Computer Science". It is best to sign up for the course and watch these lectures.
We will cover:
Check out Subsection 5.5. "Dictionaries" of the Python Tutorial.
Also, check out Subsection 4.4 "Dictionaries" of the Programming Knights book. Here is the link to Chapter 4.
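As a quick warm-up before the readings, here is a minimal sketch (my own, not taken from the readings) of the basic dictionary operations; it reuses the same keyword/URL data that we will later put into our list-based index.
# a small dictionary mapping keywords to lists of URLs
urls_by_keyword = {'ucf': ['http://ucf.edu']}
urls_by_keyword['quantum'] = ['http://mit.edu']          # add a new key/value pair
urls_by_keyword['ucf'].append('http://www.cs.ucf.edu')   # extend an existing entry
print(urls_by_keyword['quantum'])                        # ['http://mit.edu']
print('computing' in urls_by_keyword)                    # False: this key is not in the dictionary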
We seek to obtain a list of all the URLs instead of printing them out. We start from the procedure that prints every link it finds:
def get_next_target(s):
    start_link = s.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = s.find('"', start_link)
    end_quote = s.find('"', start_quote + 1)
    url = s[start_quote + 1 : end_quote]
    return url, end_quote

def print_all_links(page):
    while True:
        url, endpos = get_next_target(page)
        if url:
            print(url)
            page = page[endpos:]
        else:
            break
To obtain a list of all the URLs instead of printing them, complete the missing pieces in the procedure below:
def print_all_links(page):
    # initialize the variable links
    links =
    while True:
        url, endpos = get_next_target(page)
        if url:
            print(url) # change this line
            page = page[endpos:]
        else:
            break
    # complete the return statement
    return
Here is the completed procedure, which returns a list of all the URLs instead of printing them:
def print_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
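As a quick sanity check (the test string below is made up, not from the lecture), we can call the procedure on a tiny fake page containing two links:
test_page = 'x <a href="http://a.com">a</a> y <a href="http://b.com">b</a>'
print(print_all_links(test_page))  # expected: ['http://a.com', 'http://b.com']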
# start with tocrawl = [seed]
# crawled = []
# while there are more pages to crawl:
#     pick a page from tocrawl
#     add that page to crawled
#     add all the link targets on this page to tocrawl
# return crawled
# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page
def crawl_web(seed):
    tocrawl = # initialize correctly
    crawled = # initialize correctly
    # don't worry about the rest of the code
# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page
def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    # don't worry about the rest of the code
# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page
def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = # retrieve the last URL from the list tocrawl and assign it to the variable url
        # if you use the correct list operation,
        # you can remove this URL from the list tocrawl at the same time
        # don't worry about the rest of the code
# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page
def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = tocrawl.pop()
        # don't worry about the rest of the code
# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page
def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = tocrawl.pop()
        if <test>: # crawl only if we have not yet crawled this URL
            # fill in the correct test
            # don't worry about the rest of the code
# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page
def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            # don't worry about the rest of the code
# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page.
import urllib.request

def get_page(url):
    response = urllib.request.urlopen(url)
    return response.read().decode("utf-8")

def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def union(p,q):
    for e in q:
        if e not in p:
            p.append(e)

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            # fill in the missing code using
            # the functions union, get_all_links, get_page
    return crawled
print(crawl_web('http://www.cs.ucf.edu/courses/cop3223/spr2014/section1/start_crawl.html'))
# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page.
import urllib.request

def get_page(url):
    response = urllib.request.urlopen(url)
    return response.read().decode("utf-8")

def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def union(p,q):
    for e in q:
        if e not in p:
            p.append(e)

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            union(tocrawl, get_all_links(get_page(url)))
            crawled.append(url)
    return crawled

def main():
    print(crawl_web('http://www.cs.ucf.edu/courses/cop3223/spr2014/section1/start_crawl.html'))

main()
We will use the package networkx. Check out the networkx tutorial.
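Before using it in the crawler, here is a minimal sketch (my own, assuming networkx is installed) of the two calls we rely on below, add_node and add_edge:
import networkx as nx

G = nx.DiGraph()        # an empty directed graph
G.add_node('a')         # add a node named 'a'
G.add_edge('a', 'b')    # add the directed edge a -> b (node 'b' is created automatically)
print(list(G.nodes()))  # ['a', 'b']
print(list(G.edges()))  # [('a', 'b')]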
# Define a procedure crawl_web that takes
# as input a seed page url, and outputs a list of all the urls that
# can be reached by following links by starting from the seed page.
import urllib.request
import networkx as nx # package for working with graphs
import matplotlib.pyplot as plt # nx.draw draws onto a matplotlib figure

# copy code from previous slide
# omitted for brevity

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    # initialize G to be an empty directed graph
    G = nx.DiGraph()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            all_links = get_all_links(get_page(url))
            union(tocrawl, all_links)
            crawled.append(url)
            # add a node to the graph G and name it using the value in url
            G.add_node(url)
            # for each link URL u in the web page corresponding to url,
            # add the directed edge of the form current URL -> link URL
            for u in all_links:
                G.add_edge(url, u) # to name the edge, use the values stored in the variables url and u
    return crawled, G

def main():
    crawled, G = crawl_web('http://www.cs.ucf.edu/courses/cop3223/spr2014/section1/start_crawl.html')
    print(crawled)
    nx.draw(G)
    plt.show() # display the drawing

main()
# Define a procedure, add_to_index,
# that takes 3 inputs:
# - an index: [[keyword, [url, ...]], ...]
# - a keyword: String
# - a url: String
# If the keyword is already
# in the index, add the url
# to the list of urls associated
# with that keyword.
# If the keyword is not in the index,
# add an entry to the index: [keyword,[url]]
def add_to_index(index,keyword,url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword,[url]])
# Define a procedure, lookup,
# that takes two inputs:
# - an index
# - keyword
# The procedure should return a list
# of the urls associated
# with the keyword. If the keyword
# is not in the index, the procedure
# should return an empty list.
def lookup(index,keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []
# Define a procedure, add_page_to_index,
# that takes three inputs:
# - index
# - url (String)
# - content (String)
# It should update the index to include
# all of the word occurrences found in the
# page content by adding the url to the
# word's associated url list.
def add_page_to_index(index,url,content):
    keywords = content.split()
    for keyword in keywords:
        add_to_index(index,keyword,url)
# Define a procedure, add_to_index,
# that takes 3 inputs:
# - an index: [[keyword, [url, ...]], ...]
# - a keyword: String
# - a url: String
# If the keyword is already
# in the index, add the url
# to the list of urls associated
# with that keyword.
# If the keyword is not in the index,
# add an entry to the index: [keyword,[url]]
def add_to_index(index,keyword,url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword,[url]])
# Define a procedure, lookup,
# that takes two inputs:
# - an index
# - keyword
# The procedure should return a list
# of the urls associated
# with the keyword. If the keyword
# is not in the index, the procedure
# should return an empty list.
def lookup(index,keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []
# Define a procedure, add_page_to_index,
# that takes three inputs:
# - index
# - url (String)
# - content (String)
# It should update the index to include
# all of the word occurrences found in the
# page content by adding the url to the
# word's associated url list.
def add_page_to_index(index,url,content):
    keywords = content.split()
    for keyword in keywords:
        add_to_index(index,keyword,url)
index = []
add_to_index(index,'ucf','http://ucf.edu')
add_to_index(index,'quantum','http://mit.edu')
add_to_index(index,'computing','http://mit.edu')
print(index)
print(lookup(index,'ucf'))
print(lookup(index,'quantum'))
def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = []
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(index, url, content)
            union(tocrawl, get_all_links(content))
            crawled.append(url)
    return index
import urllib.request
# Define a procedure, add_to_index,
# that takes 3 inputs:
# - an index: [[keyword, [url, ...]], ...]
# - a keyword: String
# - a url: String
# If the keyword is already
# in the index, add the url
# to the list of urls associated
# with that keyword.
# If the keyword is not in the index,
# add an entry to the index: [keyword,[url]]
def add_to_index(index,keyword,url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword,[url]])
# Define a procedure, lookup,
# that takes two inputs:
# - an index
# - keyword
# The procedure should return a list
# of the urls associated
# with the keyword. If the keyword
# is not in the index, the procedure
# should return an empty list.
def lookup(index,keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []
# Define a procedure, add_page_to_index,
# that takes three inputs:
# - index
# - url (String)
# - content (String)
# It should update the index to include
# all of the word occurrences found in the
# page content by adding the url to the
# word's associated url list.
def add_page_to_index(index,url,content):
    keywords = content.split()
    for keyword in keywords:
        add_to_index(index,keyword,url)
#
#
#
def get_page(url):
    response = urllib.request.urlopen(url)
    return response.read().decode("utf-8")

def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links

def union(p,q):
    for e in q:
        if e not in p:
            p.append(e)

def crawl_web(seed):
    tocrawl = [seed]
    crawled = []
    index = []
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(index, url, content)
            union(tocrawl, get_all_links(content))
            crawled.append(url)
    return index
# commented out definition of main on purpose so we can access index in the shell
# def main():
index = crawl_web('http://www.cs.ucf.edu/courses/cop3223/spr2014/section1/start_crawl.html')
print(index)
#main()
We will be using the Python module time.
import time

def time_execution(code):
    start = time.perf_counter() # time.clock() was removed in Python 3.8; use perf_counter instead
    result = eval(code)
    run_time = time.perf_counter() - start
    return result, run_time

# for instance, on my computer I get
# >>> time_execution('2+2')
# (4, 8.623211838437968e-05)
# the first element, 4, is the result of the code '2+2' and
# the second element, 8.623211838437968e-05, is the time (in seconds)
# it takes to execute this code
We will be using the Python module time.
import time

def time_execution(code):
    start = time.perf_counter() # time.clock() was removed in Python 3.8; use perf_counter instead
    result = eval(code)
    run_time = time.perf_counter() - start
    return result, run_time

# for instance, on my computer I get
# >>> time_execution('2+2')
# (4, 8.623211838437968e-05)
# the first element, 4, is the result of the code '2+2' and
# the second element, 8.623211838437968e-05, is the time (in seconds)
# it takes to execute this code
def add_to_index(index,keyword,url):
    for entry in index:
        if entry[0] == keyword:
            entry[1].append(url)
            return
    index.append([keyword,[url]])

def lookup(index,keyword):
    for entry in index:
        if entry[0] == keyword:
            return entry[1]
    return []

def make_string(p):
    s = ""
    for e in p:
        s = s + e
    return s
# building a fake index
def make_big_index(size):
    index = []
    letters = ['a','a','a','a','a','a','a','a']
    while len(index) < size:
        word = make_string(letters)
        add_to_index(index, word, 'fake')
        for i in range(len(letters) - 1, 0, -1):
            if letters[i] < 'z':
                letters[i] = chr(ord(letters[i]) + 1)
                break
            else:
                letters[i] = 'a'
    return index
index10k = make_big_index(10000)
print("first element of our fake index: ", index10k[0])
print("last element of our fake index: ", index10k[-1])
# timing lookup
print(time_execution("lookup(index10k,'aaaaaaaa')"))
print(time_execution("lookup(index10k,'zzzzzzzz')"))