]> git.nbdom.net Git - nb.git/commitdiff
bin/rss-feedparser
authorNicolas Boisselier <nicolas.boisselier@gmail.com>
Sat, 23 Jun 2018 03:47:16 +0000 (04:47 +0100)
committerNicolas Boisselier <nicolas.boisselier@gmail.com>
Sat, 23 Jun 2018 03:47:16 +0000 (04:47 +0100)
bin/rss-feedparser [new file with mode: 0755]

diff --git a/bin/rss-feedparser b/bin/rss-feedparser
new file mode 100755 (executable)
index 0000000..0bf759e
--- /dev/null
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+from bs4 import BeautifulSoup
+import re
+import feedparser
+import time
+import datetime
+from dateutil import parser
+def parseRSS( rss_url ):
+    """Fetch the feed located at *rss_url* and return feedparser's parse result."""
+    parsed_feed = feedparser.parse( rss_url )
+    return parsed_feed
+    
+def getHeadlines( rss_url, debug=False ):
+    """Print one tab-separated line per feed entry and return the entry titles.
+
+    Each printed line holds, in order: the channel title, the entry date
+    formatted as ``YYYY-MM-DD HH:MM:SS``, the entry title, and the entry
+    text with its HTML markup removed.  Entries without a usable date or
+    without a title are skipped.
+
+    Args:
+        rss_url: Feed location, handed unchanged to ``parseRSS``.
+        debug: When true, dump every key/value pair of every entry instead
+            of printing the tab-separated lines, and return an empty list.
+
+    Returns:
+        list: Titles of the entries that were printed, in the order the
+        parsed feed lists them.
+    """
+    headlines = []
+
+    feed = parseRSS( rss_url )
+
+    if debug:
+        # Raw field dump, useful when exploring what an unknown feed provides.
+        for newsitem in feed['items']:
+            print()
+            for k, v in newsitem.items():
+                print(k, ' : ', v)
+        return headlines
+
+    # Looked up once; .get() avoids a KeyError when the channel has no title.
+    channel_title = feed['feed'].get('title', '')
+
+    for newsitem in feed['items']:
+        # Date: prefer the publication date, fall back to the update date.
+        # .get() plus a truth test also rejects a key that is present but
+        # empty, which time.strftime would not accept.
+        stamp = newsitem.get('published_parsed') or newsitem.get('updated_parsed')
+        if not stamp:
+            continue
+        # Explicit directives: '%F %T' is not available on every platform.
+        published = time.strftime('%Y-%m-%d %H:%M:%S', stamp)
+
+        # Title
+        if 'title' not in newsitem:
+            continue
+        title = newsitem['title']
+
+        # Content: full content first, then the description, else nothing.
+        # (A former branch keyed on 'Zmedia_content' is omitted: it tested
+        # one key but read another and left `html` unassigned.)
+        if 'content' in newsitem:
+            html = newsitem['content'][0]['value']
+        elif 'description' in newsitem:
+            html = newsitem['description']
+        else:
+            html = ''
+
+        # Strip the markup, then fold each line break (and the whitespace
+        # before it) into a single space so the entry stays on one line.
+        soup = BeautifulSoup(html, "html.parser")
+        content = re.sub(r'\s*\n', ' ', soup.get_text())
+
+        print("\t".join([channel_title, published, title, content]))
+        headlines.append(title)
+
+    return headlines
+# Collects the headlines returned for every feed
+allheadlines = []
+# RSS feeds to fetch and combine; a list so they are processed in this order
+urls = [
+    #'https://feeds.feedburner.com/wrc/',
+    #'https://www.egaliteetreconciliation.fr/spip.php?page=backend',
+    'https://www.youtube.com/feeds/videos.xml?channel_id=UCUIjs9R044OjAxKzk0xhGoQ',
+]
+
+for url in urls:
+    # Tolerate a None result so a feed that yields nothing is simply skipped.
+    allheadlines.extend( getHeadlines( url ) or [] )
+