# rawdog: RSS aggregator without delusions of grandeur. # Copyright 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2012, 2013, 2014, 2015, 2016 Adam Sampson # # rawdog is free software; you can redistribute and/or modify it # under the terms of that license as published by the Free Software # Foundation; either version 2 of the License, or (at your option) # any later version. # # rawdog is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with rawdog; see the file COPYING. If not, write to the Free # Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, # MA 02110-1301, USA, or see http://www.gnu.org/. VERSION = "2.22-wmf1" HTTP_AGENT = "rawdog/" + VERSION STATE_VERSION = 2 import rawdoglib.feedscanner from rawdoglib.persister import Persistable, Persister from rawdoglib.plugins import Box, call_hook, load_plugins from cStringIO import StringIO import base64 import calendar import cgi import feedparser import getopt import hashlib import locale import os import re import socket import string import sys import threading import time import types import urllib2 import urlparse try: import tidylib except: tidylib = None try: import mx.Tidy as mxtidy except: mxtidy = None # Turn off content-cleaning, since we want to see an approximation to the # original content for hashing. rawdog will sanitise HTML when writing. feedparser.RESOLVE_RELATIVE_URIS = 0 feedparser.SANITIZE_HTML = 0 # Disable microformat support, because it tends to return poor-quality data # (e.g. identifying inappropriate things as enclosures), and it relies on # BeautifulSoup which is unable to parse many feeds. feedparser.PARSE_MICROFORMATS = 0 # This is initialised in main(). persister = None system_encoding = None def get_system_encoding(): """Get the system encoding.""" return system_encoding def safe_ftime(format, t): """Format a time value into a string in the current locale (as time.strftime), but encode the result as ASCII HTML.""" try: u = unicode(time.strftime(format, t), get_system_encoding()) except ValueError, e: u = u"(bad time %s; %s)" % (repr(t), str(e)) return encode_references(u) def format_time(secs, config): """Format a time and date nicely.""" try: t = time.localtime(secs) except ValueError, e: return u"(bad time %s; %s)" % (repr(secs), str(e)) format = config["datetimeformat"] if format is None: format = config["timeformat"] + ", " + config["dayformat"] return safe_ftime(format, t) high_char_re = re.compile(r'[^\000-\177]') def encode_references(s): """Encode characters in a Unicode string using HTML references.""" def encode(m): return "&#" + str(ord(m.group(0))) + ";" return high_char_re.sub(encode, s) # This list of block-level elements came from the HTML 4.01 specification. block_level_re = re.compile(r'^\s*<(p|h1|h2|h3|h4|h5|h6|ul|ol|pre|dl|div|noscript|blockquote|form|hr|table|fieldset|address)[^a-z]', re.I) def sanitise_html(html, baseurl, inline, config): """Attempt to turn arbitrary feed-provided HTML into something suitable for safe inclusion into the rawdog output. The inline parameter says whether to expect a fragment of inline text, or a sequence of block-level elements.""" if html is None: return None html = encode_references(html) type = "text/html" # sgmllib handles "
<br/>
/" as a SHORTTAG; this workaround from # feedparser. html = re.sub(r'(\S)/>', r'\1 />', html) # sgmllib is fragile with broken processing instructions (e.g. # ""); just remove them all. html = re.sub(r']*>', '', html) html = feedparser._resolveRelativeURIs(html, baseurl, "UTF-8", type) p = feedparser._HTMLSanitizer("UTF-8", type) p.feed(html) html = p.output() if not inline and config["blocklevelhtml"]: # If we're after some block-level HTML and the HTML doesn't # start with a block-level element, then insert a

tag # before it. This still fails when the HTML contains text, then # a block-level element, then more text, but it's better than # nothing. if block_level_re.match(html) is None: html = "

" + html if config["tidyhtml"]: args = { "numeric_entities": 1, "input_encoding": "ascii", "output_encoding": "ascii", "output_html": 1, "output_xhtml": 0, "output_xml": 0, "wrap": 0, } call_hook("mxtidy_args", config, args, baseurl, inline) call_hook("tidy_args", config, args, baseurl, inline) if tidylib is not None: # Disable PyTidyLib's somewhat unhelpful defaults. tidylib.BASE_OPTIONS = {} output = tidylib.tidy_document(html, args)[0] elif mxtidy is not None: output = mxtidy.tidy(html, None, None, **args)[2] else: # No Tidy bindings installed -- do nothing. output = "" + html + "" html = output[output.find("") + 6 : output.rfind("")].strip() html = html.decode("UTF-8") box = Box(html) call_hook("clean_html", config, box, baseurl, inline) return box.value def select_detail(details): """Pick the preferred type of detail from a list of details. (If the argument isn't a list, treat it as a list of one.)""" TYPES = { "text/html": 30, "application/xhtml+xml": 20, "text/plain": 10, } if details is None: return None if type(details) is not list: details = [details] ds = [] for detail in details: ctype = detail.get("type", None) if ctype is None: continue if TYPES.has_key(ctype): score = TYPES[ctype] else: score = 0 if detail["value"] != "": ds.append((score, detail)) ds.sort() if len(ds) == 0: return None else: return ds[-1][1] def detail_to_html(details, inline, config, force_preformatted=False): """Convert a detail hash or list of detail hashes as returned by feedparser into HTML.""" detail = select_detail(details) if detail is None: return None if force_preformatted: html = "

" + cgi.escape(detail["value"]) + "
" elif detail["type"] == "text/plain": html = cgi.escape(detail["value"]) else: html = detail["value"] return sanitise_html(html, detail["base"], inline, config) def author_to_html(entry, feedurl, config): """Convert feedparser author information to HTML.""" author_detail = entry.get("author_detail") if author_detail is not None and author_detail.has_key("name"): name = author_detail["name"] else: name = entry.get("author") url = None fallback = "author" if author_detail is not None: if author_detail.has_key("href"): url = author_detail["href"] elif author_detail.has_key("email") and author_detail["email"] is not None: url = "mailto:" + author_detail["email"] if author_detail.has_key("email") and author_detail["email"] is not None: fallback = author_detail["email"] elif author_detail.has_key("href") and author_detail["href"] is not None: fallback = author_detail["href"] if name == "": name = fallback if url is None: html = name else: html = "" + cgi.escape(name) + "" # We shouldn't need a base URL here anyway. return sanitise_html(html, feedurl, True, config) def string_to_html(s, config): """Convert a string to HTML.""" return sanitise_html(cgi.escape(s), "", True, config) template_re = re.compile(r'(__[^_].*?__)') def fill_template(template, bits): """Expand a template, replacing __x__ with bits["x"], and only including sections bracketed by __if_x__ .. [__else__ ..] __endif__ if bits["x"] is not "". If not bits.has_key("x"), __x__ expands to "".""" result = Box() call_hook("fill_template", template, bits, result) if result.value is not None: return result.value encoding = get_system_encoding() f = StringIO() if_stack = [] def write(s): if not False in if_stack: f.write(s) for part in template_re.split(template): if part.startswith("__") and part.endswith("__"): key = part[2:-2] if key.startswith("if_"): k = key[3:] if_stack.append(bits.has_key(k) and bits[k] != "") elif key == "endif": if if_stack != []: if_stack.pop() elif key == "else": if if_stack != []: if_stack.append(not if_stack.pop()) elif bits.has_key(key): if type(bits[key]) == types.UnicodeType: write(bits[key].encode(encoding)) else: write(bits[key]) else: write(part) v = f.getvalue() f.close() return v file_cache = {} def load_file(name): """Read the contents of a template file, caching the result so we don't have to read the file multiple times. 
The file is assumed to be in the system encoding; the result will be an ASCII string.""" if not file_cache.has_key(name): try: f = open(name) data = f.read() f.close() except IOError: raise ConfigError("Can't read template file: " + name) try: data = data.decode(get_system_encoding()) except UnicodeDecodeError, e: raise ConfigError("Character encoding problem in template file: " + name + ": " + str(e)) data = encode_references(data) file_cache[name] = data.encode(get_system_encoding()) return file_cache[name] def write_ascii(f, s, config): """Write the string s, which should only contain ASCII characters, to file f; if it isn't encodable in ASCII, then print a warning message and write UTF-8.""" try: f.write(s) except UnicodeEncodeError, e: config.bug("Error encoding output as ASCII; UTF-8 has been written instead.\n", e) f.write(s.encode("UTF-8")) def short_hash(s): """Return a human-manipulatable 'short hash' of a string.""" return hashlib.sha1(s).hexdigest()[-8:] def ensure_unicode(value, encoding): """Convert a structure returned by feedparser into an equivalent where all strings are represented as fully-decoded unicode objects.""" if isinstance(value, str): try: return value.decode(encoding) except: # If the encoding's invalid, at least preserve # the byte stream. return value.decode("ISO-8859-1") elif isinstance(value, unicode) and type(value) is not unicode: # This is a subclass of unicode (e.g. BeautifulSoup's # NavigableString, which is unpickleable in some versions of # the library), so force it to be a real unicode object. return unicode(value) elif isinstance(value, dict): d = {} for (k, v) in value.items(): d[k] = ensure_unicode(v, encoding) return d elif isinstance(value, list): return [ensure_unicode(v, encoding) for v in value] else: return value timeout_re = re.compile(r'timed? ?out', re.I) def is_timeout_exception(exc): """Return True if the given exception object suggests that a timeout occurred, else return False.""" # Since urlopen throws away the original exception object, # we have to look at the stringified form to tell if it was a timeout. # (We're in reasonable company here, since test_ssl.py in the Python # distribution does the same thing!) # # The message we're looking for is something like: # Stock Python 2.7.7 and 2.7.8: # # Debian python 2.7.3-4+deb7u1: # # Debian python 2.7.8-1: # return timeout_re.search(str(exc)) is not None class BasicAuthProcessor(urllib2.BaseHandler): """urllib2 handler that does HTTP basic authentication or proxy authentication with a fixed username and password. (Unlike the classes to do this in urllib2, this doesn't wait for a 401/407 response first.)""" def __init__(self, user, password, proxy=False): self.auth = base64.b64encode(user + ":" + password) if proxy: self.header = "Proxy-Authorization" else: self.header = "Authorization" def http_request(self, req): req.add_header(self.header, "Basic " + self.auth) return req https_request = http_request class DisableIMProcessor(urllib2.BaseHandler): """urllib2 handler that disables RFC 3229 for a request.""" def http_request(self, req): # Request doesn't provide a method for removing headers -- # so overwrite the header instead. req.add_header("A-IM", "identity") return req https_request = http_request class ResponseLogProcessor(urllib2.BaseHandler): """urllib2 handler that maintains a log of HTTP responses.""" # Run after anything that's mangling headers (usually 500 or less), but # before HTTPErrorProcessor (1000). 
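	# Each entry in the log is a dict of the form {"url": ..., "status": ...},
	# plus a "location" key when the response carried a Location header.
	# Feed.fetch() attaches the whole log to the feedparser result as
	# "rawdog_responses", and Feed.update() reads it back to work out the
	# final HTTP status and any permanent redirects.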
handler_order = 900 def __init__(self): self.log = [] def http_response(self, req, response): entry = { "url": req.get_full_url(), "status": response.getcode(), } location = response.info().get("Location") if location is not None: entry["location"] = location self.log.append(entry) return response https_response = http_response def get_log(self): return self.log non_alphanumeric_re = re.compile(r'<[^>]*>|\&[^\;]*\;|[^a-z0-9]') class Feed: """An RSS feed.""" def __init__(self, url): self.url = url self.period = 30 * 60 self.args = {} self.etag = None self.modified = None self.last_update = 0 self.feed_info = {} def needs_update(self, now): """Return True if it's time to update this feed, or False if its update period has not yet elapsed.""" return (now - self.last_update) >= self.period def get_state_filename(self): return "feeds/%s.state" % (short_hash(self.url),) def fetch(self, rawdog, config): """Fetch the current set of articles from the feed.""" handlers = [] logger = ResponseLogProcessor() handlers.append(logger) proxies = {} for name, value in self.args.items(): if name.endswith("_proxy"): proxies[name[:-6]] = value if len(proxies) != 0: handlers.append(urllib2.ProxyHandler(proxies)) if self.args.has_key("proxyuser") and self.args.has_key("proxypassword"): handlers.append(BasicAuthProcessor(self.args["proxyuser"], self.args["proxypassword"], proxy=True)) if self.args.has_key("user") and self.args.has_key("password"): handlers.append(BasicAuthProcessor(self.args["user"], self.args["password"])) if self.get_keepmin(config) == 0 or config["currentonly"]: # If RFC 3229 and "A-IM: feed" is used, then there's # no way to tell when an article has been removed. # So if we only want to keep articles that are still # being published by the feed, we have to turn it off. handlers.append(DisableIMProcessor()) call_hook("add_urllib2_handlers", rawdog, config, self, handlers) url = self.url # Turn plain filenames into file: URLs. (feedparser will open # plain filenames itself, but we want it to open the file with # urllib2 so we get a URLError if something goes wrong.) if not ":" in url: url = "file:" + url try: result = feedparser.parse(url, etag=self.etag, modified=self.modified, agent=HTTP_AGENT, handlers=handlers) except Exception, e: result = { "rawdog_exception": e, "rawdog_traceback": sys.exc_info()[2], } result["rawdog_responses"] = logger.get_log() return result def update(self, rawdog, now, config, articles, p): """Add new articles from a feed to the collection. Returns True if any articles were read, False otherwise.""" # Note that feedparser might have thrown an exception -- # so until we print the error message and return, we # can't assume that p contains any particular field. responses = p.get("rawdog_responses") if len(responses) > 0: last_status = responses[-1]["status"] elif len(p.get("feed", [])) != 0: # Some protocol other than HTTP -- assume it's OK, # since we got some content. last_status = 200 else: # Timeout, or empty response from non-HTTP. last_status = 0 version = p.get("version") if version is None: version = "" self.last_update = now errors = [] fatal = False old_url = self.url if "rawdog_exception" in p: errors.append("Error fetching or parsing feed:") errors.append(str(p["rawdog_exception"])) if config["showtracebacks"]: from traceback import format_tb errors.append("".join(format_tb(p["rawdog_traceback"]))) errors.append("") fatal = True if len(responses) != 0 and responses[0]["status"] == 301: # Permanent redirect(s). Find the new location. 
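			# (Illustrative, with hypothetical URLs: if the response log were
			#   301 http://a/feed  -> Location: http://b/feed
			#   301 http://b/feed  -> Location: http://c/feed
			#   200 http://c/feed
			# the loop below skips past the 301s and takes the Location of the
			# last one, i.e. http://c/feed.)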
i = 0 while i < len(responses) and responses[i]["status"] == 301: i += 1 location = responses[i - 1].get("location") # According to RFC 2616, the Location header should be # an absolute URI. This doesn't stop the occasional # server sending something like "Location: /" or # "Location: //foo/bar". It's usually a sign of # brokenness, so fail rather than trying to interpret # it liberally. valid_uri = True if location is not None: parsed = urlparse.urlparse(location) if parsed.scheme == "" or parsed.netloc == "": valid_uri = False if not valid_uri: errors.append("New URL: " + location) errors.append("The feed returned a permanent redirect, but with an invalid new location.") elif location is None: errors.append("The feed returned a permanent redirect, but without a new location.") else: errors.append("New URL: " + location) errors.append("The feed has moved permanently to a new URL.") if config["changeconfig"]: rawdog.change_feed_url(self.url, location, config) errors.append("The config file has been updated automatically.") else: errors.append("You should update its entry in your config file.") errors.append("") bozo_exception = p.get("bozo_exception") got_urlerror = isinstance(bozo_exception, urllib2.URLError) got_timeout = isinstance(bozo_exception, socket.timeout) if got_urlerror or got_timeout: # urllib2 reported an error when fetching the feed. # Check to see if it was a timeout. if not (got_timeout or is_timeout_exception(bozo_exception)): errors.append("Error while fetching feed:") errors.append(str(bozo_exception)) errors.append("") fatal = True elif config["ignoretimeouts"]: return False else: errors.append("Timeout while reading feed.") errors.append("") fatal = True elif last_status == 304: # The feed hasn't changed. Return False to indicate # that we shouldn't do expiry. return False elif last_status in [403, 410]: # The feed is disallowed or gone. The feed should be # unsubscribed. errors.append("The feed has gone.") errors.append("You should remove it from your config file.") errors.append("") fatal = True elif last_status / 100 != 2: # Some sort of client or server error. The feed may # need unsubscribing. errors.append("The feed returned an error.") errors.append("If this condition persists, you should remove it from your config file.") errors.append("") fatal = True elif version == "" and len(p.get("entries", [])) == 0: # feedparser couldn't detect the type of this feed or # retrieve any entries from it. errors.append("The data retrieved from this URL could not be understood as a feed.") errors.append("You should check whether the feed has changed URLs or been removed.") errors.append("") fatal = True old_error = "\n".join(errors) call_hook("feed_fetched", rawdog, config, self, p, old_error, not fatal) if len(errors) != 0: print >>sys.stderr, "Feed: " + old_url if last_status != 0: print >>sys.stderr, "HTTP Status: " + str(last_status) for line in errors: print >>sys.stderr, line if fatal: return False # From here, we can assume that we've got a complete feedparser # response. p = ensure_unicode(p, p.get("encoding") or "UTF-8") # No entries means the feed hasn't changed, but for some reason # we didn't get a 304 response. Handle it the same way. if len(p["entries"]) == 0: return False self.etag = p.get("etag") self.modified = p.get("modified") self.feed_info = p["feed"] feed = self.url article_ids = {} if config["useids"]: # Find IDs for existing articles. 
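			# (Sketch of the "useids true" behaviour: an incoming entry whose
			# feed-supplied id matches one of these existing articles is treated
			# as an update to that article even if its title or content -- and
			# therefore its hash -- has changed; without ids, only an entry that
			# hashes identically is recognised as the same article.)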
for (hash, a) in articles.items(): id = a.entry_info.get("id") if a.feed == feed and id is not None: article_ids[id] = a seen_articles = set() sequence = 0 for entry_info in p["entries"]: article = Article(feed, entry_info, now, sequence) ignore = Box(False) call_hook("article_seen", rawdog, config, article, ignore) if ignore.value: continue seen_articles.add(article.hash) sequence += 1 id = entry_info.get("id") if id in article_ids: existing_article = article_ids[id] elif article.hash in articles: existing_article = articles[article.hash] else: existing_article = None if existing_article is not None: existing_article.update_from(article, now) call_hook("article_updated", rawdog, config, existing_article, now) else: articles[article.hash] = article call_hook("article_added", rawdog, config, article, now) if config["currentonly"]: for (hash, a) in articles.items(): if a.feed == feed and hash not in seen_articles: del articles[hash] return True def get_html_name(self, config): if self.feed_info.has_key("title_detail"): r = detail_to_html(self.feed_info["title_detail"], True, config) elif self.feed_info.has_key("link"): r = string_to_html(self.feed_info["link"], config) else: r = string_to_html(self.url, config) if r is None: r = "" return r def get_html_link(self, config): s = self.get_html_name(config) if self.feed_info.has_key("link"): return '' + s + '' else: return s def get_id(self, config): if self.args.has_key("id"): return self.args["id"] else: r = self.get_html_name(config).lower() return non_alphanumeric_re.sub('', r) def get_keepmin(self, config): return self.args.get("keepmin", config["keepmin"]) class Article: """An article retrieved from an RSS feed.""" def __init__(self, feed, entry_info, now, sequence): self.feed = feed self.entry_info = entry_info self.sequence = sequence self.date = None parsed = entry_info.get("updated_parsed") if parsed is None: parsed = entry_info.get("published_parsed") if parsed is None: parsed = entry_info.get("created_parsed") if parsed is not None: try: self.date = calendar.timegm(parsed) except OverflowError: pass self.hash = self.compute_initial_hash() self.last_seen = now self.added = now def compute_initial_hash(self): """Compute an initial unique hash for an article. The generated hash must be unique amongst all articles in the system (i.e. 
it can't just be the article ID, because that would collide if more than one feed included the same article).""" h = hashlib.sha1() def add_hash(s): h.update(s.encode("UTF-8")) add_hash(self.feed) entry_info = self.entry_info if entry_info.has_key("title"): add_hash(entry_info["title"]) if entry_info.has_key("link"): add_hash(entry_info["link"]) if entry_info.has_key("content"): for content in entry_info["content"]: add_hash(content["value"]) if entry_info.has_key("summary_detail"): add_hash(entry_info["summary_detail"]["value"]) return h.hexdigest() def update_from(self, new_article, now): """Update this article's contents from a newer article that's been identified to be the same.""" self.entry_info = new_article.entry_info self.sequence = new_article.sequence self.date = new_article.date self.last_seen = now def can_expire(self, now, config): return (now - self.last_seen) > config["expireage"] def get_sort_date(self, config): if config["sortbyfeeddate"]: return self.date or self.added else: return self.added class DayWriter: """Utility class for writing day sections into a series of articles.""" def __init__(self, file, config): self.lasttime = [] self.file = file self.counter = 0 self.config = config def start_day(self, tm): print >>self.file, '
<div class="day">'
		day = safe_ftime(self.config["dayformat"], tm)
		print >>self.file, '<h2>' + day + '</h2>'
		self.counter += 1

	def start_time(self, tm):
		print >>self.file, '<div class="time">'
		clock = safe_ftime(self.config["timeformat"], tm)
		print >>self.file, '<h3>' + clock + '</h3>'
		self.counter += 1

	def time(self, s):
		try:
			tm = time.localtime(s)
		except ValueError:
			# e.g. "timestamp out of range for platform time_t"
			return
		if tm[:3] != self.lasttime[:3] and self.config["daysections"]:
			self.close(0)
			self.start_day(tm)
		if tm[:6] != self.lasttime[:6] and self.config["timesections"]:
			if self.config["daysections"]:
				self.close(1)
			else:
				self.close(0)
			self.start_time(tm)
		self.lasttime = tm

	def close(self, n=0):
		while self.counter > n:
			print >>self.file, "</div>
" self.counter -= 1 def parse_time(value, default="m"): """Parse a time period with optional units (s, m, h, d, w) into a time in seconds. If no unit is specified, use minutes by default; specify the default argument to change this. Raises ValueError if the format isn't recognised.""" units = { "s": 1, "m": 60, "h": 3600, "d": 86400, "w": 604800, } for unit, size in units.items(): if value.endswith(unit): return int(value[:-len(unit)]) * size return int(value) * units[default] def parse_bool(value): """Parse a boolean value (0, 1, false or true). Raise ValueError if the value isn't recognised.""" value = value.strip().lower() if value == "0" or value == "false": return False elif value == "1" or value == "true": return True else: raise ValueError("Bad boolean value: " + value) def parse_list(value): """Parse a list of keywords separated by whitespace.""" return value.strip().split(None) def parse_feed_args(argparams, arglines): """Parse a list of feed arguments. Raise ConfigError if the syntax is invalid, or ValueError if an argument value can't be parsed.""" args = {} for p in argparams: ps = p.split("=", 1) if len(ps) != 2: raise ConfigError("Bad feed argument in config: " + p) args[ps[0]] = ps[1] for p in arglines: ps = p.split(None, 1) if len(ps) != 2: raise ConfigError("Bad argument line in config: " + p) args[ps[0]] = ps[1] for name, value in args.items(): if name == "allowduplicates": args[name] = parse_bool(value) elif name == "keepmin": args[name] = int(value) elif name == "maxage": args[name] = parse_time(value) return args class ConfigError(Exception): pass class Config: """The aggregator's configuration.""" def __init__(self, locking=True, logfile_name=None): self.locking = locking self.files_loaded = [] self.loglock = threading.Lock() self.logfile = None if logfile_name: self.logfile = open(logfile_name, "a") self.reset() def reset(self): # Note that these default values are *not* the same as # in the supplied config file. The idea is that someone # who has an old config file shouldn't notice a difference # in behaviour on upgrade -- so new options generally # default to False here, and True in the sample file. 
self.config = { "feedslist" : [], "feeddefaults" : {}, "defines" : {}, "outputfile" : "output.html", "oldpages" : 7, "maxarticles" : 200, "maxage" : 0, "expireage" : 24 * 60 * 60, "keepmin" : 0, "dayformat" : "%A, %d %B %Y", "timeformat" : "%I:%M %p", "datetimeformat" : None, "userefresh" : False, "showfeeds" : True, "timeout" : 30, "pagetemplate" : "default", "itemtemplate" : "default", "feedlisttemplate" : "default", "feeditemtemplate" : "default", "verbose" : False, "ignoretimeouts" : False, "showtracebacks" : False, "daysections" : True, "timesections" : True, "blocklevelhtml" : True, "tidyhtml" : False, "sortbyfeeddate" : False, "currentonly" : False, "hideduplicates" : [], "newfeedperiod" : "3h", "changeconfig": False, "numthreads": 1, "splitstate": False, "useids": False, } def __getitem__(self, key): return self.config[key] def get(self, key, default=None): return self.config.get(key, default) def __setitem__(self, key, value): self.config[key] = value def reload(self): self.log("Reloading config files") self.reset() for filename in self.files_loaded: self.load(filename, False) def load(self, filename, explicitly_loaded=True): """Load configuration from a config file.""" if explicitly_loaded: self.files_loaded.append(filename) lines = [] try: f = open(filename, "r") for line in f.xreadlines(): try: line = line.decode(get_system_encoding()) except UnicodeDecodeError, e: raise ConfigError("Character encoding problem in config file: " + filename + ": " + str(e)) stripped = line.strip() if stripped == "" or stripped[0] == "#": continue if line[0] in string.whitespace: if lines == []: raise ConfigError("First line in config cannot be an argument") lines[-1][1].append(stripped) else: lines.append((stripped, [])) f.close() except IOError: raise ConfigError("Can't read config file: " + filename) for line, arglines in lines: try: self.load_line(line, arglines) except ValueError: raise ConfigError("Bad value in config: " + line) def load_line(self, line, arglines): """Process a configuration directive.""" l = line.split(None, 1) if len(l) == 1 and l[0] == "feeddefaults": l.append("") elif len(l) != 2: raise ConfigError("Bad line in config: " + line) # Load template files immediately, so we produce an error now # rather than later if anything goes wrong. 
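		# (Illustrative: a directive such as
		#     feed 30m http://example.org/feed.xml keepmin=5
		# ends up in config["feedslist"] as
		#     ("http://example.org/feed.xml", 1800, {"keepmin": 5})
		# -- the period converted to seconds by parse_time() and the inline
		# key=value arguments parsed by parse_feed_args().)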
if l[0].endswith("template") and l[1] != "default": load_file(l[1]) handled_arglines = False if l[0] == "feed": l = l[1].split(None) if len(l) < 2: raise ConfigError("Bad line in config: " + line) self["feedslist"].append((l[1], parse_time(l[0]), parse_feed_args(l[2:], arglines))) handled_arglines = True elif l[0] == "feeddefaults": self["feeddefaults"] = parse_feed_args(l[1].split(None), arglines) handled_arglines = True elif l[0] == "define": l = l[1].split(None, 1) if len(l) != 2: raise ConfigError("Bad line in config: " + line) self["defines"][l[0]] = l[1] elif l[0] == "plugindirs": for dir in parse_list(l[1]): load_plugins(dir, self) elif l[0] == "outputfile": self["outputfile"] = l[1] elif l[0] == "oldpages": self["oldpages"] = l[1] elif l[0] == "maxarticles": self["maxarticles"] = int(l[1]) elif l[0] == "maxage": self["maxage"] = parse_time(l[1]) elif l[0] == "expireage": self["expireage"] = parse_time(l[1]) elif l[0] == "keepmin": self["keepmin"] = int(l[1]) elif l[0] == "dayformat": self["dayformat"] = l[1] elif l[0] == "timeformat": self["timeformat"] = l[1] elif l[0] == "datetimeformat": self["datetimeformat"] = l[1] elif l[0] == "userefresh": self["userefresh"] = parse_bool(l[1]) elif l[0] == "showfeeds": self["showfeeds"] = parse_bool(l[1]) elif l[0] == "timeout": self["timeout"] = parse_time(l[1], "s") elif l[0] in ("template", "pagetemplate"): self["pagetemplate"] = l[1] elif l[0] == "itemtemplate": self["itemtemplate"] = l[1] elif l[0] == "feedlisttemplate": self["feedlisttemplate"] = l[1] elif l[0] == "feeditemtemplate": self["feeditemtemplate"] = l[1] elif l[0] == "verbose": self["verbose"] = parse_bool(l[1]) elif l[0] == "ignoretimeouts": self["ignoretimeouts"] = parse_bool(l[1]) elif l[0] == "showtracebacks": self["showtracebacks"] = parse_bool(l[1]) elif l[0] == "daysections": self["daysections"] = parse_bool(l[1]) elif l[0] == "timesections": self["timesections"] = parse_bool(l[1]) elif l[0] == "blocklevelhtml": self["blocklevelhtml"] = parse_bool(l[1]) elif l[0] == "tidyhtml": self["tidyhtml"] = parse_bool(l[1]) elif l[0] == "sortbyfeeddate": self["sortbyfeeddate"] = parse_bool(l[1]) elif l[0] == "currentonly": self["currentonly"] = parse_bool(l[1]) elif l[0] == "hideduplicates": self["hideduplicates"] = parse_list(l[1]) elif l[0] == "newfeedperiod": self["newfeedperiod"] = l[1] elif l[0] == "changeconfig": self["changeconfig"] = parse_bool(l[1]) elif l[0] == "numthreads": self["numthreads"] = int(l[1]) elif l[0] == "splitstate": self["splitstate"] = parse_bool(l[1]) elif l[0] == "useids": self["useids"] = parse_bool(l[1]) elif l[0] == "include": self.load(l[1], False) elif call_hook("config_option_arglines", self, l[0], l[1], arglines): handled_arglines = True elif call_hook("config_option", self, l[0], l[1]): pass else: raise ConfigError("Unknown config command: " + l[0]) if arglines != [] and not handled_arglines: raise ConfigError("Bad argument lines in config after: " + line) def log(self, *args): """Print a status message. 
If running in verbose mode, write the message to stderr; if using a logfile, write it to the logfile.""" if self["verbose"]: with self.loglock: print >>sys.stderr, "".join(map(str, args)) if self.logfile is not None: with self.loglock: print >>self.logfile, "".join(map(str, args)) self.logfile.flush() def bug(self, *args): """Report detection of a bug in rawdog.""" print >>sys.stderr, "Internal error detected in rawdog:" print >>sys.stderr, "".join(map(str, args)) print >>sys.stderr, "This could be caused by a bug in rawdog itself or in a plugin." print >>sys.stderr, "Please send this error message and your config file to the rawdog author." def edit_file(filename, editfunc): """Edit a file in place: for each line in the input file, call editfunc(line, outputfile), then rename the output file over the input file.""" newname = "%s.new-%d" % (filename, os.getpid()) oldfile = open(filename, "r") newfile = open(newname, "w") editfunc(oldfile, newfile) newfile.close() oldfile.close() os.rename(newname, filename) class AddFeedEditor: def __init__(self, feedline): self.feedline = feedline def edit(self, inputfile, outputfile): d = inputfile.read() outputfile.write(d) if not d.endswith("\n"): outputfile.write("\n") outputfile.write(self.feedline) def add_feed(filename, url, rawdog, config): """Try to add a feed to the config file.""" feeds = rawdoglib.feedscanner.feeds(url) if feeds == []: print >>sys.stderr, "Cannot find any feeds in " + url return feed = feeds[0] if feed in rawdog.feeds: print >>sys.stderr, "Feed " + feed + " is already in the config file" return print >>sys.stderr, "Adding feed " + feed feedline = "feed %s %s\n" % (config["newfeedperiod"], feed) edit_file(filename, AddFeedEditor(feedline).edit) class ChangeFeedEditor: def __init__(self, oldurl, newurl): self.oldurl = oldurl self.newurl = newurl def edit(self, inputfile, outputfile): for line in inputfile.xreadlines(): ls = line.strip().split(None) if len(ls) > 2 and ls[0] == "feed" and ls[2] == self.oldurl: line = line.replace(self.oldurl, self.newurl, 1) outputfile.write(line) class RemoveFeedEditor: def __init__(self, url): self.url = url def edit(self, inputfile, outputfile): while True: l = inputfile.readline() if l == "": break ls = l.strip().split(None) if len(ls) > 2 and ls[0] == "feed" and ls[2] == self.url: while True: l = inputfile.readline() if l == "": break elif l[0] == "#": outputfile.write(l) elif l[0] not in string.whitespace: outputfile.write(l) break else: outputfile.write(l) def remove_feed(filename, url, config): """Try to remove a feed from the config file.""" if url not in [f[0] for f in config["feedslist"]]: print >>sys.stderr, "Feed " + url + " is not in the config file" else: print >>sys.stderr, "Removing feed " + url edit_file(filename, RemoveFeedEditor(url).edit) class FeedFetcher: """Class that will handle fetching a set of feeds in parallel.""" def __init__(self, rawdog, feedlist, config): self.rawdog = rawdog self.config = config self.lock = threading.Lock() self.jobs = set(feedlist) self.results = {} def worker(self, num): rawdog = self.rawdog config = self.config while True: with self.lock: try: job = self.jobs.pop() except KeyError: # No jobs left. 
break config.log("[", num, "] Fetching feed: ", job) feed = rawdog.feeds[job] call_hook("pre_update_feed", rawdog, config, feed) result = feed.fetch(rawdog, config) with self.lock: self.results[job] = result def run(self, max_workers): max_workers = max(max_workers, 1) num_workers = min(max_workers, len(self.jobs)) self.config.log("Fetching ", len(self.jobs), " feeds using ", num_workers, " threads") workers = [] for i in range(1, num_workers): t = threading.Thread(target=self.worker, args=(i,)) t.start() workers.append(t) self.worker(0) for worker in workers: worker.join() self.config.log("Fetch complete") return self.results class FeedState(Persistable): """The collection of articles in a feed.""" def __init__(self): Persistable.__init__(self) self.articles = {} class Rawdog(Persistable): """The aggregator itself.""" def __init__(self): Persistable.__init__(self) self.feeds = {} self.articles = {} self.plugin_storage = {} self.state_version = STATE_VERSION self.using_splitstate = None def get_plugin_storage(self, plugin): try: st = self.plugin_storage.setdefault(plugin, {}) except AttributeError: # rawdog before 2.5 didn't have plugin storage. st = {} self.plugin_storage = {plugin: st} return st def check_state_version(self): """Check the version of the state file.""" try: version = self.state_version except AttributeError: # rawdog 1.x didn't keep track of this. version = 1 return version == STATE_VERSION def change_feed_url(self, oldurl, newurl, config): """Change the URL of a feed.""" assert self.feeds.has_key(oldurl) if self.feeds.has_key(newurl): print >>sys.stderr, "Error: New feed URL is already subscribed; please remove the old one" print >>sys.stderr, "from the config file by hand." return edit_file("config", ChangeFeedEditor(oldurl, newurl).edit) feed = self.feeds[oldurl] # Changing the URL will change the state filename as well, # so we need to save the old name to load from. old_state = feed.get_state_filename() feed.url = newurl del self.feeds[oldurl] self.feeds[newurl] = feed if config["splitstate"]: feedstate_p = persister.get(FeedState, old_state) feedstate_p.rename(feed.get_state_filename()) with feedstate_p as feedstate: for article in feedstate.articles.values(): article.feed = newurl feedstate.modified() else: for article in self.articles.values(): if article.feed == oldurl: article.feed = newurl print >>sys.stderr, "Feed URL automatically changed." def list(self, config): """List the configured feeds.""" for url, feed in self.feeds.items(): feed_info = feed.feed_info print url print " ID:", feed.get_id(config) print " Hash:", short_hash(url) print " Title:", feed.get_html_name(config) print " Link:", feed_info.get("link") def sync_from_config(self, config): """Update rawdog's internal state to match the configuration.""" # Make sure the splitstate directory exists. if config["splitstate"]: try: os.mkdir("feeds") except OSError: # Most likely it already exists. pass # Convert to or from splitstate if necessary. try: u = self.using_splitstate except AttributeError: # We were last run with a version of rawdog that didn't # have this variable -- so we must have a single state # file. 
u = False if u is None: self.using_splitstate = config["splitstate"] elif u != config["splitstate"]: if config["splitstate"]: config.log("Converting to split state files") for feed_hash, feed in self.feeds.items(): with persister.get(FeedState, feed.get_state_filename()) as feedstate: feedstate.articles = {} for article_hash, article in self.articles.items(): if article.feed == feed_hash: feedstate.articles[article_hash] = article feedstate.modified() self.articles = {} else: config.log("Converting to single state file") self.articles = {} for feed_hash, feed in self.feeds.items(): with persister.get(FeedState, feed.get_state_filename()) as feedstate: for article_hash, article in feedstate.articles.items(): self.articles[article_hash] = article feedstate.articles = {} feedstate.modified() persister.delete(feed.get_state_filename()) self.modified() self.using_splitstate = config["splitstate"] seen_feeds = set() for (url, period, args) in config["feedslist"]: seen_feeds.add(url) if not self.feeds.has_key(url): config.log("Adding new feed: ", url) self.feeds[url] = Feed(url) self.modified() feed = self.feeds[url] if feed.period != period: config.log("Changed feed period: ", url) feed.period = period self.modified() newargs = {} newargs.update(config["feeddefaults"]) newargs.update(args) if feed.args != newargs: config.log("Changed feed options: ", url) feed.args = newargs self.modified() for url in self.feeds.keys(): if url not in seen_feeds: config.log("Removing feed: ", url) if config["splitstate"]: persister.delete(self.feeds[url].get_state_filename()) else: for key, article in self.articles.items(): if article.feed == url: del self.articles[key] del self.feeds[url] self.modified() def update(self, config, feedurl=None): """Perform the update action: check feeds for new articles, and expire old ones.""" config.log("Starting update") now = time.time() socket.setdefaulttimeout(config["timeout"]) if feedurl is None: update_feeds = [url for url in self.feeds.keys() if self.feeds[url].needs_update(now)] elif self.feeds.has_key(feedurl): update_feeds = [feedurl] self.feeds[feedurl].etag = None self.feeds[feedurl].modified = None else: print "No such feed: " + feedurl update_feeds = [] numfeeds = len(update_feeds) config.log("Will update ", numfeeds, " feeds") fetcher = FeedFetcher(self, update_feeds, config) fetched = fetcher.run(config["numthreads"]) seen_some_items = set() def do_expiry(articles): """Expire articles from a list. 
Return True if any articles were expired.""" feedcounts = {} for key, article in articles.items(): url = article.feed feedcounts[url] = feedcounts.get(url, 0) + 1 expiry_list = [] feedcounts = {} for key, article in articles.items(): url = article.feed feedcounts[url] = feedcounts.get(url, 0) + 1 expiry_list.append((article.added, article.sequence, key, article)) expiry_list.sort() count = 0 for date, seq, key, article in expiry_list: url = article.feed if url not in self.feeds: config.log("Expired article for nonexistent feed: ", url) count += 1 del articles[key] continue if (url in seen_some_items and self.feeds.has_key(url) and article.can_expire(now, config) and feedcounts[url] > self.feeds[url].get_keepmin(config)): call_hook("article_expired", self, config, article, now) count += 1 feedcounts[url] -= 1 del articles[key] config.log("Expired ", count, " articles, leaving ", len(articles)) return count > 0 count = 0 for url in update_feeds: count += 1 config.log("Updating feed ", count, " of ", numfeeds, ": ", url) feed = self.feeds[url] if config["splitstate"]: feedstate_p = persister.get(FeedState, feed.get_state_filename()) feedstate = feedstate_p.open() articles = feedstate.articles else: articles = self.articles content = fetched[url] call_hook("mid_update_feed", self, config, feed, content) rc = feed.update(self, now, config, articles, content) url = feed.url call_hook("post_update_feed", self, config, feed, rc) if rc: seen_some_items.add(url) if config["splitstate"]: feedstate.modified() if config["splitstate"]: if do_expiry(articles): feedstate.modified() feedstate_p.close() if config["splitstate"]: self.articles = {} else: do_expiry(self.articles) self.modified() config.log("Finished update") def get_template(self, config, name="page"): """Return the contents of a template.""" filename = config.get(name + "template", "default") if filename != "default": return load_file(filename) if name == "page": template = """ """ if config["userefresh"]: template += """__refresh__ """ template += """ rawdog
__items__
""" if config["showfeeds"]: template += """

Feeds

__feeds__
""" template += """ """ return template elif name == "item": return """

__title__ [__feed_title__]

__if_description__
__description__
__endif__
""" elif name == "feedlist": return """ __feeditems__
FeedRSSLast fetchedNext fetched after
""" elif name == "feeditem": return """ __feed_title__ __feed_icon__ __feed_last_update__ __feed_next_update__ """ else: raise KeyError("Unknown template name: " + name) def show_template(self, name, config): """Show the contents of a template, as currently configured.""" try: print self.get_template(config, name), except KeyError: print >>sys.stderr, "Unknown template name: " + name def write_article(self, f, article, config): """Write an article to the given file.""" feed = self.feeds[article.feed] entry_info = article.entry_info link = entry_info.get("link") if link == "": link = None guid = entry_info.get("id") if guid == "": guid = None itembits = self.get_feed_bits(config, feed) for name, value in feed.args.items(): if name.startswith("define_"): itembits[name[7:]] = sanitise_html(value, "", True, config) title = detail_to_html(entry_info.get("title_detail"), True, config) key = None for k in ["content", "summary_detail"]: if entry_info.has_key(k): key = k break if key is None: description = None else: force_preformatted = (feed.args.get("format", "default") == "text") description = detail_to_html(entry_info[key], False, config, force_preformatted) date = article.date if title is None: if link is None: title = "Article" else: title = "Link" itembits["title_no_link"] = title if link is not None: itembits["url"] = string_to_html(link, config) else: itembits["url"] = "" if guid is not None: itembits["guid"] = string_to_html(guid, config) else: itembits["guid"] = "" if link is None: itembits["title"] = title else: itembits["title"] = '' + title + '' itembits["hash"] = short_hash(article.hash) if description is not None: itembits["description"] = description else: itembits["description"] = "" author = author_to_html(entry_info, feed.url, config) if author is not None: itembits["author"] = author else: itembits["author"] = "" itembits["added"] = format_time(article.added, config) if date is not None: itembits["date"] = format_time(date, config) else: itembits["date"] = "" call_hook("output_item_bits", self, config, feed, article, itembits) itemtemplate = self.get_template(config, "item") f.write(fill_template(itemtemplate, itembits)) def write_remove_dups(self, articles, config, now): """Filter the list of articles to remove articles that are too old or are duplicates.""" kept_articles = [] seen_links = set() seen_guids = set() dup_count = 0 for article in articles: feed = self.feeds[article.feed] age = now - article.added maxage = feed.args.get("maxage", config["maxage"]) if maxage != 0 and age > maxage: continue entry_info = article.entry_info link = entry_info.get("link") if link == "": link = None guid = entry_info.get("id") if guid == "": guid = None if not feed.args.get("allowduplicates", False): is_dup = False for key in config["hideduplicates"]: if key == "id" and guid is not None: if guid in seen_guids: is_dup = True seen_guids.add(guid) elif key == "link" and link is not None: if link in seen_links: is_dup = True seen_links.add(link) if is_dup: dup_count += 1 continue kept_articles.append(article) return (kept_articles, dup_count) def get_feed_bits(self, config, feed): """Get the bits that are used to describe a feed.""" bits = {} bits["feed_id"] = feed.get_id(config) bits["feed_hash"] = short_hash(feed.url) bits["feed_title"] = feed.get_html_link(config) bits["feed_title_no_link"] = detail_to_html(feed.feed_info.get("title_detail"), True, config) bits["feed_url"] = string_to_html(feed.url, config) bits["feed_icon"] = 'XML' bits["feed_last_update"] = format_time(feed.last_update, 
config) bits["feed_next_update"] = format_time(feed.last_update + feed.period, config) return bits def write_feeditem(self, f, feed, config): """Write a feed list item.""" bits = self.get_feed_bits(config, feed) f.write(fill_template(self.get_template(config, "feeditem"), bits)) def write_feedlist(self, f, config): """Write the feed list.""" bits = {} feeds = [(feed.get_html_name(config).lower(), feed) for feed in self.feeds.values()] feeds.sort() feeditems = StringIO() for key, feed in feeds: self.write_feeditem(feeditems, feed, config) bits["feeditems"] = feeditems.getvalue() feeditems.close() f.write(fill_template(self.get_template(config, "feedlist"), bits)) def get_main_template_bits(self, config): """Get the bits that are used in the default main template, with the exception of items and num_items.""" bits = {"version": VERSION} bits.update(config["defines"]) refresh = min([config["expireage"]] + [feed.period for feed in self.feeds.values()]) bits["refresh"] = '' f = StringIO() self.write_feedlist(f, config) bits["feeds"] = f.getvalue() f.close() bits["num_feeds"] = str(len(self.feeds)) return bits def write_output_file(self, articles, article_dates, config, oldpage=0): """Write a regular rawdog HTML output file.""" f = StringIO() dw = DayWriter(f, config) call_hook("output_items_begin", self, config, f) for article in articles: if not call_hook("output_items_heading", self, config, f, article, article_dates[article]): dw.time(article_dates[article]) self.write_article(f, article, config) dw.close() call_hook("output_items_end", self, config, f) if oldpage != config["oldpages"]: filename = config["outputfile"].split("/")[-1:][0] # get the filename only filename = filename.split(".html") outputfile = filename[0] + str(oldpage+1) + ".html" f.write('

<a href="' + outputfile + '">Older blog entries</a>

') bits = self.get_main_template_bits(config) bits["items"] = f.getvalue() f.close() bits["num_items"] = str(len(articles)) call_hook("output_bits", self, config, bits) s = fill_template(self.get_template(config, "page"), bits) if oldpage > 0: filename = config["outputfile"].split(".html") outputfile = filename[0] + str(oldpage) + ".html" else: outputfile = config["outputfile"] if outputfile == "-": write_ascii(sys.stdout, s, config) else: config.log("Writing output file: ", outputfile) f = open(outputfile + ".new", "w") write_ascii(f, s, config) f.close() os.rename(outputfile + ".new", outputfile) def write(self, config): """Perform the write action: write articles to the output file.""" config.log("Starting write") now = time.time() def list_articles(articles): return [(-a.get_sort_date(config), a.feed, a.sequence, a.hash) for a in articles.values()] if config["splitstate"]: article_list = [] for feed in self.feeds.values(): with persister.get(FeedState, feed.get_state_filename()) as feedstate: article_list += list_articles(feedstate.articles) else: article_list = list_articles(self.articles) numarticles = len(article_list) if not call_hook("output_sort_articles", self, config, article_list): article_list.sort() # if config["maxarticles"] != 0: # article_list = article_list[:config["maxarticles"]] if config["splitstate"]: wanted = {} for (date, feed_url, seq, hash) in article_list: if not feed_url in self.feeds: # This can happen if you've managed to # kill rawdog between it updating a # split state file and the main state # -- so just ignore the article and # it'll expire eventually. continue wanted.setdefault(feed_url, []).append(hash) found = {} for (feed_url, article_hashes) in wanted.items(): feed = self.feeds[feed_url] with persister.get(FeedState, feed.get_state_filename()) as feedstate: for hash in article_hashes: found[hash] = feedstate.articles[hash] else: found = self.articles articles = [] article_dates = {} for (date, feed, seq, hash) in article_list: a = found.get(hash) if a is not None: articles.append(a) article_dates[a] = -date call_hook("output_write", self, config, articles) if not call_hook("output_sorted_filter", self, config, articles): (articles, dup_count) = self.write_remove_dups(articles, config, now) else: dup_count = 0 config.log("Selected ", len(articles), " of ", numarticles, " articles to write; ignored ", dup_count, " duplicates") for page in range(0, config["oldpages"]+1): print "on page: " + str(page) if config["maxarticles"] != 0: pageArticles = articles[config["maxarticles"]*page:config["maxarticles"]*(page+1)] if not call_hook("output_write_files", self, config, pageArticles, article_dates): self.write_output_file(pageArticles, article_dates, config, page) config.log("Finished write") def usage(): """Display usage information.""" print """rawdog, version """ + VERSION + """ Usage: rawdog [OPTION]... 
General options (use only once): -d|--dir DIR Use DIR instead of ~/.rawdog -N, --no-locking Do not lock the state file -v, --verbose Print more detailed status information -V|--log FILE Append detailed status information to FILE -W, --no-lock-wait Exit silently if state file is locked Actions (performed in order given): -a|--add URL Try to find a feed associated with URL and add it to the config file -c|--config FILE Read additional config file FILE -f|--update-feed URL Force an update on the single feed URL -l, --list List feeds known at time of last update -r|--remove URL Remove feed URL from the config file -s|--show TEMPLATE Show the contents of a template (TEMPLATE may be: page item feedlist feeditem) -u, --update Fetch data from feeds and store it -w, --write Write out HTML output Special actions (all other options are ignored if one of these is specified): --dump URL Show what rawdog's parser returns for URL --help Display this help and exit Report bugs to .""" def main(argv): """The command-line interface to the aggregator.""" locale.setlocale(locale.LC_ALL, "") # This is quite expensive and not threadsafe, so we do it on # startup and cache the result. global system_encoding system_encoding = locale.getpreferredencoding() try: SHORTOPTS = "a:c:d:f:lNr:s:tTuvV:wW" LONGOPTS = [ "add=", "config=", "dir=", "dump=", "help", "list", "log=", "no-lock-wait", "no-locking", "remove=", "show=", "show-itemtemplate", "show-template", "update", "update-feed=", "verbose", "write", ] (optlist, args) = getopt.getopt(argv, SHORTOPTS, LONGOPTS) except getopt.GetoptError, s: print s usage() return 1 if len(args) != 0: usage() return 1 if "HOME" in os.environ: statedir = os.environ["HOME"] + "/.rawdog" else: statedir = None verbose = False logfile_name = None locking = True no_lock_wait = False for o, a in optlist: if o == "--dump": import pprint pprint.pprint(feedparser.parse(a, agent=HTTP_AGENT)) return 0 elif o == "--help": usage() return 0 elif o in ("-d", "--dir"): statedir = a elif o in ("-N", "--no-locking"): locking = False elif o in ("-v", "--verbose"): verbose = True elif o in ("-V", "--log"): logfile_name = a elif o in ("-W", "--no-lock-wait"): no_lock_wait = True if statedir is None: print "$HOME not set and state dir not explicitly specified; please use -d/--dir" return 1 try: os.chdir(statedir) except OSError: print "No " + statedir + " directory" return 1 sys.path.append(".") config = Config(locking, logfile_name) def load_config(fn): try: config.load(fn) except ConfigError, err: print >>sys.stderr, "In " + fn + ":" print >>sys.stderr, err return 1 if verbose: config["verbose"] = True return 0 rc = load_config("config") if rc != 0: return rc global persister persister = Persister(config) rawdog_p = persister.get(Rawdog, "state") rawdog = rawdog_p.open(no_block=no_lock_wait) if rawdog is None: return 0 if not rawdog.check_state_version(): print "The state file " + statedir + "/state was created by an older" print "version of rawdog, and cannot be read by this version." print "Removing the state file will fix it." 
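		# (Assumption rather than documented behaviour: the "state" file is a
		# pickled Rawdog object managed by rawdoglib.persister, so removing it
		# loses stored feed and article state but not the configuration, which
		# lives in the config file.)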
return 1 rawdog.sync_from_config(config) call_hook("startup", rawdog, config) for o, a in optlist: if o in ("-a", "--add"): add_feed("config", a, rawdog, config) config.reload() rawdog.sync_from_config(config) elif o in ("-c", "--config"): rc = load_config(a) if rc != 0: return rc rawdog.sync_from_config(config) elif o in ("-f", "--update-feed"): rawdog.update(config, a) elif o in ("-l", "--list"): rawdog.list(config) elif o in ("-r", "--remove"): remove_feed("config", a, config) config.reload() rawdog.sync_from_config(config) elif o in ("-s", "--show"): rawdog.show_template(a, config) elif o in ("-t", "--show-template"): rawdog.show_template("page", config) elif o in ("-T", "--show-itemtemplate"): rawdog.show_template("item", config) elif o in ("-u", "--update"): rawdog.update(config) elif o in ("-w", "--write"): rawdog.write(config) call_hook("shutdown", rawdog, config) rawdog_p.close() return 0
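# (Illustrative usage, not part of this module: the "rawdog" command-line
# script is expected to do little more than
#
#     import sys
#     import rawdoglib.rawdog
#     sys.exit(rawdoglib.rawdog.main(sys.argv[1:]))
#
# so a typical cron job runs "rawdog --update --write" to fetch feeds and
# regenerate the output page.)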