#!/usr/bin/env python3
"""Fetch RSS/Atom feeds and print one tab-separated line per item.

Each output line contains: channel title, publication date
(``YYYY-MM-DD HH:MM:SS``), item title, and the item content with HTML
stripped and newlines collapsed to single spaces.
"""
from bs4 import BeautifulSoup
import re
import feedparser
import time


def parseRSS(rss_url):
    """Fetch the feed at *rss_url* and return the parsed feedparser result."""
    return feedparser.parse(rss_url)


def getHeadlines(rss_url):
    """Print one tab-separated line per feed item and return the lines.

    Items lacking both a publication/updated date and a title are skipped.
    Returns a list of ``[channel, date, title, content]`` lists — one per
    printed line.  (The original version contained a leftover debug loop
    followed by a bare ``return`` that made all of this unreachable, and it
    never appended to ``headlines``, so it always returned an empty list.)
    """
    headlines = []

    feed = parseRSS(rss_url)
    # The channel title is the same for every item; look it up once.
    channel_title = feed['feed']['title']

    for newsitem in feed['items']:
        line = [channel_title]

        # Date: prefer the published timestamp, fall back to updated;
        # skip items that carry neither.
        if 'published_parsed' in newsitem:
            published = time.strftime('%F %T', newsitem['published_parsed'])
        elif 'updated_parsed' in newsitem:
            published = time.strftime('%F %T', newsitem['updated_parsed'])
        else:
            continue
        line.append(published)

        # Title is mandatory for an output line.
        if 'title' not in newsitem:
            continue
        line.append(newsitem['title'])

        # Content: prefer the full content entry, fall back to the
        # description/summary, else emit an empty field.
        if 'content' in newsitem:
            html = newsitem['content'][0]['value']
        elif 'description' in newsitem:
            html = newsitem['description']
        else:
            html = ''

        # Strip HTML tags and collapse newlines (plus surrounding
        # whitespace) into single spaces so the record stays on one line.
        soup = BeautifulSoup(html, "html.parser")
        content = re.sub(r'\s*\n', ' ', soup.get_text())
        line.append(content)

        print("\t".join(line))
        headlines.append(line)

    return headlines


# List of RSS feeds that we will fetch and combine.
urls = {
    # 'https://feeds.feedburner.com/wrc/',
    'https://www.youtube.com/feeds/videos.xml?channel_id=UCUIjs9R044OjAxKzk0xhGoQ',
}

if __name__ == "__main__":
    # Accumulate every feed's lines (previously this list was never filled).
    allheadlines = []
    for url in urls:
        allheadlines.extend(getHeadlines(url))