Python: a small server for downloading URLs

The script automatically downloads the RSS feeds from my OPML list…

config.ini

[sql]
# Configuration entries for the database connection
# Only MySQL can be used

# self-explanatory
host=localhost
user=root
pw=root
db=test
# the query __must__ alias the URL column as remote_url
# and the ID column as remote_id
query=SELECT url AS remote_url, id AS remote_id FROM test_table

[threading]
# fetch site list every x seconds from database
requery_sites_in_sec=10.0
# time between two consecutive downloads of the same page, in seconds
wait_between_query_in_sec=10.0

[output]
# output directory
to_dir=/tmp
# you can use these variables:
#   $remote_file_name prefix of filename in URL (e.g. http://url/file.html => file)
#   $remote_id ID from database
#   $remote_file_extension suffix of filename in URL (e.g. http://url/file.bla.html => html)
filename=$remote_file_name.$remote_id.$remote_file_extension
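
For reference, the default query expects a table with an id and a url column. Here is a minimal sketch of a matching setup; the helper script name, the credentials, and the sample feed URL are only assumptions for illustration, not part of the daemon itself:

setup-table.py

# setup-table.py creates the example table expected by the default query
# (hypothetical helper for illustration only)
import MySQLdb

conn = MySQLdb.connect(host = "localhost", user = "root", passwd = "root", db = "test")
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS test_table (
        id  INT AUTO_INCREMENT PRIMARY KEY,
        url VARCHAR(255) NOT NULL
    )
""")
# insert a sample feed URL to download
cursor.execute("INSERT INTO test_table (url) VALUES (%s)", ("http://example.com/feed.xml",))
conn.commit()
cursor.close()
conn.close()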

collect-page-daemon.py

# collect-page-daemon.py reads URLs from a database table and
# downloads them to the local file system.
# The time between download attempts is configured in config.ini.
# I wrote this script to automatically download my favourite RSS feeds.
from threading import Thread
import ConfigParser
import MySQLdb
import time
import os
import urllib

# read configuration from config.ini
config = ConfigParser.ConfigParser()
config.read("config.ini")
sql_host = config.get("sql", "host")
sql_user = config.get("sql", "user")
sql_password = config.get("sql", "pw")
sql_db = config.get("sql", "db")
sql_query = config.get("sql", "query")
wait_between_query_in_sec = config.getfloat("threading", "wait_between_query_in_sec")
requery_sites_in_sec = config.getfloat("threading", "requery_sites_in_sec")
output_filename = config.get("output", "filename")
output_to_dir = config.get("output", "to_dir")

# open connection to MySQL database
conn = MySQLdb.connect(host = sql_host, user = sql_user, passwd = sql_password, db = sql_db)
# use column names as dictionary keys
cursor = conn.cursor(MySQLdb.cursors.DictCursor)
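# (with DictCursor every fetched row is a dict, so columns can be
#  accessed as row["remote_url"] / row["remote_id"] instead of by index)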

# debug output for command line
def dbg(output):
print " [+] " + time.strftime("%Y-%m-%d %H:%M:%S") + " " + output

# class for downloading a single web page in a loop
class QueryPageTask(Thread):
    def __init__(self, id, url):
        Thread.__init__(self)
        # unique id of the site
        self.id = id
        # url of the file to download
        self.url = url
        # flag that keeps the download loop alive
        self.running = 1

    # set the running flag (0 stops the task after the current iteration)
    def setRunning(self, running):
        self.running = running

    # execute task
    def run(self):
        while self.running:
            dbg("Executing query page task for " + self.url)

            # extract file name and corresponding extension
            idx = self.url.rfind("/")
            url_filename = self.url[idx + 1:]
            idx = url_filename.rfind(".")
            extension = url_filename[idx + 1:]
            file_name = url_filename[0:idx]

            # replace template variables
            filename = output_filename
            filename = filename.replace("$remote_id", str(self.id))
            filename = filename.replace("$remote_file_extension", extension)
            filename = filename.replace("$remote_file_name", file_name)

            local_filepath = output_to_dir + "/" + filename

            # unlink existing file
            if os.path.exists(local_filepath):
                os.unlink(local_filepath)

            # retrieve with the help of urllib and save to the local file
            urllib.urlretrieve(self.url, local_filepath)

            dbg("Next query for " + self.url + " in " + str(wait_between_query_in_sec) + " seconds")
            time.sleep(wait_between_query_in_sec)

# this class observes the database table for changes
class DatabaseObserverTask(Thread):
    def __init__(self):
        Thread.__init__(self)
        # holds all download threads
        self.arrThreads = []

    def run(self):
        while 1:
            dbg("Observer task running")
            iThreads = len(self.arrThreads)

            # stop all running download threads before requerying
            if iThreads > 0:
                dbg("Stopping " + str(iThreads) + " threads")

                # clear the running flag of every thread
                while self.arrThreads:
                    current_thread = self.arrThreads.pop()
                    current_thread.setRunning(0)

            # fetch data from the table
            cursor.execute(sql_query)
            rows = cursor.fetchall()

            dbg("Loaded " + str(len(rows)) + " entries from database")

            # create a new task for each table entry and remember it
            for row in rows:
                task_query = QueryPageTask(row["remote_id"], row["remote_url"])
                task_query.start()
                self.arrThreads.append(task_query)

            dbg("Next requery execution in " + str(requery_sites_in_sec) + " seconds")
            time.sleep(requery_sites_in_sec)

        # never reached
        cursor.close()
        conn.close()

dbg("Starting...")

# create new task instance and start
task_observer = DatabaseObserverTask()
task_observer.start()

dbg("Shutdown.")
