Reply -
Raw
from BeautifulSoup import BeautifulSoup
from urllib import urlopen, quote_plus
import urlparse
import time
def tuplelist_to_dict(tuplelist):
    """Build a dict from a sequence of indexable pairs.

    For every element ``t`` of *tuplelist*, ``t[0]`` becomes a key and
    ``t[1]`` its value. Elements beyond index 1 are ignored, and when the
    same key occurs more than once the last occurrence wins.
    """
    return dict((pair[0], pair[1]) for pair in tuplelist)
# Timestamps (from time.time()) recorded by the most recent get_list() call,
# keyed by phase name: 'start', 'got html', 'starting item parsing' and
# 'returning results'. Rebound to a fresh dict on every call.
v01 = {}


def get_list():
    """Fetch the NRK nett-tv index page and return its entries.

    The page is not requested directly: its URL is passed (URL-quoted) to
    the W3C tidy service with ``forceXML=on``, and the service's response
    is what gets parsed with BeautifulSoup.

    Side effects:
        Rebinds the module-level ``v01`` to a new dict and stores a
        ``time.time()`` timestamp in it at each phase of the call.

    Returns:
        A list with one dict per ``<li>`` found under the first ``<ul>``
        of the ``"nettv-category"`` div. Each dict holds unicode values:

        * ``'title'`` - the ``alt`` attribute of the item's first ``<img>``
        * ``'img'``   - that image's ``src``, joined against the page URL
        * ``'url'``   - the ``href`` of the item's first ``<a>``, joined
          against the page URL
        * ``'plot'``  - the text of the item's ``"summary"`` div

    Note:
        None of the element lookups are guarded, so a response that lacks
        the expected markup makes the call fail with an exception rather
        than return partial results.
    """
    global v01
    v01 = {}
    _url = "http://www.nrk.no/nett-tv/bokstav/@/"

    v01['start'] = time.time()
    response = urlopen(
        "http://www.w3.org/services/tidy?docAddr=%s&forceXML=on" % (quote_plus(_url)))
    try:
        html = response.read()
    finally:
        # Always release the connection, even if the read fails part-way.
        response.close()
    v01['got html'] = time.time()

    html = html.replace("\n", "")
    category = BeautifulSoup(html).find("div", "nettv-category")
    items = category.find("ul").findChildren("li")

    res = []
    v01['starting item parsing'] = time.time()
    for item in items:
        img_attrs = tuplelist_to_dict(item.find("img").attrs)
        res.append({
            'title': unicode(img_attrs['alt']),
            # src/href may be relative; resolve them against the page URL.
            'img': unicode(urlparse.urljoin(_url, img_attrs['src'])),
            'url': unicode(urlparse.urljoin(
                _url, tuplelist_to_dict(item.find("a").attrs)['href'])),
            'plot': unicode(item.find("div", "summary").text),
        })
    v01["returning results"] = time.time()
    return res
#print unicode(items[0])
#print dir(items[0])
# Run the scraper once, dump its result, then report the timings that
# get_list() recorded in v01: the absolute start time first, followed by
# each later phase as a "+delta" in seconds relative to the previous phase.
print(get_list())
print("%s : %s" % ("Starting".ljust(30), v01['start']))
print("%s : +%s" % ("Got html".ljust(30),
                    v01['got html'] - v01['start']))
print("%s : +%s" % ("Starting item parsing".ljust(30),
                    v01['starting item parsing'] - v01['got html']))
print("%s : +%s" % ("Returning results".ljust(30),
                    v01['returning results'] - v01['starting item parsing']))