# Parent image: the official slim Debian "stretch" build of Python 3.6.4.
FROM python:3.6.4-slim-stretch

# Install all Python dependencies in a single layer, while still root.
#   - tornado is required by Data Intelligence and is pinned to 5.0.2.
#   - beautifulsoup4, lxml, pandas and requests are the additional libraries
#     installed for the Python operator code.
# --no-cache-dir keeps pip's download cache out of the image layer.
# NOTE(review): only tornado is version-pinned; consider pinning the other
# packages too so that rebuilds are reproducible.
RUN python3 -m pip install --no-cache-dir \
        tornado==5.0.2 \
        beautifulsoup4 \
        lxml \
        pandas \
        requests

# Create the vflow user and group with a fixed numeric UID/GID (1972) and run
# as that user. This prevents the error raised when a container is configured
# with runAsNonRoot but the image would otherwise run as root.
RUN groupadd -g 1972 vflow && useradd -g 1972 -u 1972 -m vflow
USER 1972:1972
WORKDIR /home/vflow
ENV HOME=/home/vflow
import requests
import pandas as pd
from bs4 import *
# Read the BBC News RSS feed, flatten every <item> into one row, and send the
# rows as a ';'-separated CSV string on the "outmsg" port.
# NOTE(review): `api` is not defined in this script; it is presumably injected
# by the hosting pipeline runtime -- confirm.
url = "http://feeds.bbci.co.uk/news/rss.xml"

# (output column, RSS child tag) pairs, in the order the columns are written.
field_map = [
    ("RSS_TITLE", "title"),
    ("RSS_DESC", "description"),
    ("RSS_LINK", "link"),
    ("RSS_DATE", "pubDate"),
]


def _child_text(item, tag_name):
    """Return the text of the first `tag_name` child of `item`, or "" if absent."""
    child = item.find(tag_name)
    return child.text if child is not None else ""


# A timeout keeps the operator from hanging forever on an unresponsive server;
# raise_for_status() stops an HTTP error page from being parsed as a feed.
resp = requests.get(url, timeout=30)
resp.raise_for_status()

soup = BeautifulSoup(resp.content, features="xml")
items = soup.find_all('item')

news_items = []
for each_item in items:
    news_item = {}
    for column, tag_name in field_map:
        news_item[column] = _child_text(each_item, tag_name)
    news_items.append(news_item)

# Use a Pandas Dataframe to pass as CSV. Naming the columns explicitly fixes
# their order and keeps the header row even when the feed has no items.
df = pd.DataFrame(news_items, columns=[column for column, _ in field_map])
csv_body = df.to_csv(index=False, header=True, sep=";")

# Create Data Hub Message
attr = dict()
attr["message.commit.token"] = "stop-token"
messageout = api.Message(body=csv_body, attributes=attr)
api.send("outmsg", messageout)
{
  "name": "RSS_FEED",
  "type": "record",
  "fields": [
    {
      "name": "RSS_TITLE",
      "type": "fixed",
      "size": 128
    },
    {
      "name": "RSS_DESC",
      "type": "fixed",
      "size": 2500
    },
    {
      "name": "RSS_LINK",
      "type": "fixed",
      "size": 128
    },
    {
      "name": "RSS_DATE",
      "type": "fixed",
      "size": 16
    }
  ]
}
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.
User | Count |
---|---|
26 | |
25 | |
21 | |
13 | |
12 | |
9 | |
8 | |
8 | |
8 | |
8 |