#!c:/python25/python.exe
import datetime
##import re
import sqlite3
import urllib
import urlparse
from BeautifulSoup import BeautifulSoup
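#
# Alternative: the DATE_MATCH regexp below could be used to parse each
# date heading instead of the string split (it needs the commented-out
# "import re" above to be re-enabled).
#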
## DATE_MATCH = re.compile (r"\S+\s(\d\d?)(?:st|nd|rd|th)\s(\S+)\s(\d{4})")
URL = "http://dev.goodtoread.org/whatsnew"
page = BeautifulSoup (urllib.urlopen (URL))
book_reviews = []
for whatsnew_date in page.findAll ("div", "whatsnew-date"):
weekday, day, month, year = whatsnew_date.h2.string.split ()
day = day[:-2]
date = datetime.datetime.strptime ("%s %s %s" % (day, month, year), "%d %B %Y").date ()
print date
for book_summary in whatsnew_date.findAll ("div", "book-summary"):
title = book_summary.find ("span", "book-title") or book_summary.find ("span", "book-title-quick")
title = title.string
print "\t", title
synopsis = book_summary.find ("p", "synopsis").string
summary = book_summary.find ("p", "summary")
if summary:
summary = summary.string
image = book_summary.find ("img", "thumblet")
if image:
image_url = urlparse.urljoin (URL, image['src'])
image_data = urllib.urlopen (image_url).read ()
else:
image_data = ""
book_reviews.append ((title, synopsis, summary, buffer (image_data), date))
db = sqlite3.connect ("books.db")
db.execute ("DELETE FROM whatsnew")
db.executemany (
"INSERT INTO whatsnew (title, synopsis, summary, image, date_added) VALUES (?, ?, ?, ?, ?)",
book_reviews
)
db.commit ()