In [1]:
# Shortly before the start of 2019 I decided to start taking my health a bit more seriously 
#  and educate myself on the subject.
# Luckily I found a good podcast just for that. I decided to leverage my Python skills to write a little tool
#  to download the episodes.
In [2]:
import requests               # library for html requests
from bs4 import BeautifulSoup # parsing rss feed
import os                     # handling folders and files
In [3]:
# Fetch the podcast's RSS feed and parse it so that it can be searched by tag.
url = 'http://themodelhealthshow.libsyn.com/rss'
rss = requests.get(url, timeout=30)  # give up instead of hanging forever on a dead connection
rss.raise_for_status()               # stop on an HTTP error rather than parsing an error page
# NOTE: "lxml" is BeautifulSoup's HTML parser: it wraps the feed in <html><body>
#  and lower-cases tag names (e.g. <pubdate>), as the prettified dump below shows.
rss_soup = BeautifulSoup(rss.text, "lxml")
In [4]:
# To find out which tags matter, keep two line-by-line views of the feed:
#  the raw response text and BeautifulSoup's indented rendering of it.
text, prettified_feed = rss.text, rss_soup.prettify()
text_lines   = text.split('\n')
pretty_print = prettified_feed.split('\n')
In [5]:
# Head of the indented feed: channel metadata plus the start of the newest <item>.
first_lines = pretty_print[0:150]
for line in first_lines:
    print(line)
<?xml version="1.0" encoding="UTF-8"?>
<html>
 <body>
  <rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:cc="http://web.resource.org/cc/" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:media="http://search.yahoo.com/mrss/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
   <channel>
    <atom:link href="http://themodelhealthshow.libsyn.com/rss" rel="self" type="application/rss+xml">
    </atom:link>
    <title>
     The Model Health Show
    </title>
    <pubdate>
     Tue, 28 May 2019 20:46:58 +0000
    </pubdate>
    <lastbuilddate>
     Wed, 29 May 2019 00:08:17 +0000
    </lastbuilddate>
    <generator>
     Libsyn WebEngine 2.0
    </generator>
    <link/>
    http://themodelhealthshow.com
    <language>
     en
    </language>
    <copyright>
    </copyright>
    <docs>
     http://themodelhealthshow.com
    </docs>
    <managingeditor>
     advancedintegrative@gmail.com (advancedintegrative@gmail.com)
    </managingeditor>
    <itunes:summary>
    </itunes:summary>
    <image>
     <url>
      http://static.libsyn.com/p/assets/c/9/4/f/c94f93c12d4b28e5/Podcast_original.jpg
     </url>
     <title>
      The Model Health Show
     </title>
     <link/>
    </image>
    <itunes:author>
     Shawn Stevenson
    </itunes:author>
    <itunes:keywords>
     fitness,health,motivation,nutrition,selfhelp,sex,wellness
    </itunes:keywords>
    <itunes:category text="Health">
     <itunes:category text="Fitness &amp; Nutrition">
     </itunes:category>
    </itunes:category>
    <itunes:category text="Health">
     <itunes:category text="Alternative Health">
     </itunes:category>
    </itunes:category>
    <itunes:category text="Health">
     <itunes:category text="Self-Help">
     </itunes:category>
    </itunes:category>
    <itunes:image href="http://static.libsyn.com/p/assets/c/9/4/f/c94f93c12d4b28e5/Podcast_original.jpg">
    </itunes:image>
    <itunes:explicit>
     clean
    </itunes:explicit>
    <itunes:owner>
     <itunes:name>
     </itunes:name>
     <itunes:email>
      advancedintegrative@gmail.com
     </itunes:email>
    </itunes:owner>
    <description>
    </description>
    <itunes:subtitle>
    </itunes:subtitle>
    <itunes:type>
     episodic
    </itunes:type>
    <item>
     <title>
      TMHS 353: Overcoming Our Secret Stressors &amp; How To Stop Being Hangry - With Guests Sarah Fragoso &amp; Dr. Brooke Kalanick
     </title>
     <itunes:title>
      Overcoming Our Secret Stressors &amp; How To Stop Being Hangry - With Guests Sarah Fragoso &amp; Dr. Brooke Kalanick
     </itunes:title>
     <pubdate>
      Tue, 28 May 2019 20:46:58 +0000
     </pubdate>
     <guid ispermalink="false">
     </guid>
     <link/>
     <itunes:image href="http://static.libsyn.com/p/assets/c/9/4/f/c94f93c12d4b28e5/Podcast_original.jpg">
     </itunes:image>
     <description>
      When it comes to managing your health and finding a nutrition approach that works best for your body, it’s important to remember that what’s common and what’s normal are two different things. Just because certain experiences and symptoms are prevalent in our culture doesn’t mean that they’re ideal.
      <span class="Apple-converted-space">
      </span>
      <p>
       For instance, well-known commercials insinuate that hanger (being so hungry that you become angry) is a common, laughable state of being. While hanger is a real emotion, it’s not necessarily something you should be experiencing on a regular basis. Controlling your hanger starts with understanding what’s physiologically happening in your body.
       <span class="Apple-converted-space">
       </span>
      </p>
      <p>
       On this episode, I’m joined by not one, but two incredible guests! Sarah Fragoso and Dr. Brooke Kalanick are here to share the science behind being hangry. You’ll learn about the five simple steps to balance your hormones, some surprising factors that could secretly be stressing you out, and how to customize your own personal healthy lifestyle with food, exercise, and stress management. Enjoy!
       <span class="Apple-converted-space">
       </span>
      </p>
      <p>
       <strong>
        In this episode you’ll discover:
       </strong>
      </p>
      <ul>
       <li>
        Why
        <strong>
         practicing self-love
        </strong>
        is critical for your overall health.
       </li>
       <li>
        What it means to own your moment.
        <span class="Apple-converted-space">
        </span>
       </li>
       <li>
        How women’s health issues are often overlooked and dismissed.
        <span class="Apple-converted-space">
        </span>
       </li>
       <li>
        The difference between common and normal.
       </li>
       <li>
        What
        <strong>
         ACES
        </strong>
        stands for, and how it relates to your hormones.
        <span class="Apple-converted-space">
        </span>
       </li>
       <li>
        The link between
        <strong>
         cortisol and insulin
        </strong>
        .
In [6]:
# Tail of the indented feed: the end of the list of <item> blocks.
last_lines = pretty_print[len(pretty_print) - 150:] if len(pretty_print) > 150 else pretty_print[:]
for line in last_lines:
    print(line)
      Shawn Stevenson
     </itunes:author>
    </item>
    <item>
     <title>
      TMHS 001: Natural Treatment for Heart Disease, High Cholesterol, and High Blood Pressure
     </title>
     <itunes:title>
      Natural Treatment for Heart Disease, High Cholesterol, and High Blood Pressure
     </itunes:title>
     <pubdate>
      Thu, 02 May 2013 13:00:00 +0000
     </pubdate>
     <guid ispermalink="false">
     </guid>
     <link/>
     <itunes:image href="http://static.libsyn.com/p/assets/c/9/4/f/c94f93c12d4b28e5/Podcast_original.jpg">
     </itunes:image>
     <description>
      In this episode I break down the underlying cause of heart disease and share specific solutions for prevention. Heart disease is the leading cause of death in our world today, effecting 1 in every 3 people. With rates still on the rise, this is one of the most important topics for you to get educated about to protect yourself and those you care about.
      <h2 style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; line-height: 19px;">
       In this episode you'll discover:
      </h2>
      <ul style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; font-size: 13px; line-height: 19px;">
       <li>
        Why high cholesterol is NOT the cause of heart disease.
       </li>
       <li>
        How shift work and long work hours can damage your health long-term.
       </li>
       <li>
        The negative side effects of taking statins for high cholesterol.
       </li>
       <li>
        What causes high cholesterol and what to do about it.
       </li>
       <li>
        Why your hormones are an important factor in heart disease prevention.
       </li>
       <li>
        How your emotions and stress effect blood pressure.
       </li>
       <li>
        What commonly overlooked organ helps regulate your blood pressure (and it's NOT your heart).
       </li>
       <li>
        Five specific action steps to dramatically reduce you heart disease risk.
       </li>
      </ul>
      <h2 style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; line-height: 19px;">
       Items Mentioned in this podcast include:
      </h2>
      <p style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; font-size: 13px; line-height: 19px;">
       <a href="http://www.youtube.com/watch?v=sZw7Rh9px7E" rel="noopener" target="_blank">
        What is heart disease -YouTube
       </a>
      </p>
      <p style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; font-size: 13px; line-height: 19px;">
       <a href="http://articles.mercola.com/sites/articles/archive/2011/10/01/can-statin-drug-cause-liver-and-heart-to-fail.aspx" rel="noopener" target="_blank">
        Statin Drugs May Cause Liver Injury and Heart Failure
       </a>
      </p>
      <p style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; font-size: 13px; line-height: 19px;">
       Thank you so much for checking out this episode of The Model Health Show. If you haven’t done so already, please take a minute and leave a quick rating and review of the show on Apple Podcast by clicking on the link below. It will help us to keep delivering life-changing information for you every week!
      </p>
      <p style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; font-size: 13px; line-height: 19px;">
       <a href="https://itunes.apple.com/us/podcast/the-model-health-show/id640246578" rel="noopener" target="_blank">
        CLICK HERE to leave a review for the show!
       </a>
      </p>
      ]]&gt;
     </description>
     <content:encoded>
      In this episode I break down the underlying cause of heart disease and share specific solutions for prevention. Heart disease is the leading cause of death in our world today, effecting 1 in every 3 people. With rates still on the rise, this is one of the most important topics for you to get educated about to protect yourself and those you care about.  In this episode you'll discover:
      <ul style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; font-size: 13px; line-height: 19px;">
       <li>
        Why high cholesterol is NOT the cause of heart disease.
       </li>
       <li>
        How shift work and long work hours can damage your health long-term.
       </li>
       <li>
        The negative side effects of taking statins for high cholesterol.
       </li>
       <li>
        What causes high cholesterol and what to do about it.
       </li>
       <li>
        Why your hormones are an important factor in heart disease prevention.
       </li>
       <li>
        How your emotions and stress effect blood pressure.
       </li>
       <li>
        What commonly overlooked organ helps regulate your blood pressure (and it's NOT your heart).
       </li>
       <li>
        Five specific action steps to dramatically reduce you heart disease risk.
       </li>
      </ul>
      Items Mentioned in this podcast include:
      <p style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; font-size: 13px; line-height: 19px;">
       <a href="http://www.youtube.com/watch?v=sZw7Rh9px7E" rel="noopener" target="_blank">
        What is heart disease -YouTube
       </a>
      </p>
      <p style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; font-size: 13px; line-height: 19px;">
       <a href="http://articles.mercola.com/sites/articles/archive/2011/10/01/can-statin-drug-cause-liver-and-heart-to-fail.aspx" rel="noopener" target="_blank">
        Statin Drugs May Cause Liver Injury and Heart Failure
       </a>
      </p>
      <p style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; font-size: 13px; line-height: 19px;">
       Thank you so much for checking out this episode of The Model Health Show. If you haven’t done so already, please take a minute and leave a quick rating and review of the show on Apple Podcast by clicking on the link below. It will help us to keep delivering life-changing information for you every week!
      </p>
      <p style="color: #333333; font-family: Georgia, 'Times New Roman', 'Bitstream Charter', Times, serif; font-size: 13px; line-height: 19px;">
       <a href="https://itunes.apple.com/us/podcast/the-model-health-show/id640246578" rel="noopener" target="_blank">
        CLICK HERE to leave a review for the show!
       </a>
      </p>
      ]]&gt;
     </content:encoded>
     <enclosure length="117041863" type="audio/mpeg" url="http://traffic.libsyn.com/themodelhealthshow/01-Heart_Health-AIR.mp3?dest-id=136195">
     </enclosure>
     <itunes:duration>
      48:15
     </itunes:duration>
     <itunes:explicit>
      clean
     </itunes:explicit>
     <itunes:keywords>
     </itunes:keywords>
     <itunes:subtitle>
     </itunes:subtitle>
     <itunes:summary>
      In this episode I break down the underlying cause of heart disease and share specific solutions for prevention.
     </itunes:summary>
     <itunes:episode>
      1
     </itunes:episode>
     <itunes:episodetype>
      full
     </itunes:episodetype>
     <itunes:author>
      Shawn Stevenson
     </itunes:author>
    </item>
   </channel>
  </rss>
 </body>
</html>
In [7]:
# After a bit of eyeballing we notice:
# - the meat of the feed, the episode information, is contained in the <item> blocks
# - episode titles are kept in the <title> tags,
# - and the mp3 links are in the url attributes of enclosure tags.
# That's all we need.
In [8]:
list_of_titles = []
list_of_urls   = []
enclosures     = []

# Read the title and the mp3 link from the SAME <item>, so that a title can never
#  end up paired with another episode's link should an item lack an <enclosure>.
items = rss_soup.find_all(name = 'item')
for item in items :
    enclosure = item.find(name='enclosure')
    if item.title is None or enclosure is None or not enclosure.get('url') :
        continue   # nothing to download (or nothing to name it by) - skip this item
    list_of_titles.append(item.title.text)
    list_of_urls.append(enclosure.get('url'))
    enclosures.append(enclosure)

# The feed lists the newest episode first; flip both lists into chronological order.
list_of_titles.reverse()
list_of_urls.reverse()
In [9]:
# Sanity check: the first and last entries should be the oldest and the newest
#  episode we spotted while skimming the feed.
print(list_of_titles[0], list_of_urls[0])
print(list_of_titles[-1], list_of_urls[-1])
TMHS 001: Natural Treatment for Heart Disease, High Cholesterol, and High Blood Pressure http://traffic.libsyn.com/themodelhealthshow/01-Heart_Health-AIR.mp3?dest-id=136195
TMHS 353: Overcoming Our Secret Stressors & How To Stop Being Hangry - With Guests Sarah Fragoso & Dr. Brooke Kalanick http://traffic.libsyn.com/themodelhealthshow/353_-_Overcoming_Our_Secret_Stressors__How_To_Stop_Being_Hangry_-_With_Guests_Sarah_Fragoso__Dr._Brooke_Kalanick.mp3?dest-id=136195
In [10]:
# Now we make a folder to save the podcast to.
# NOTE: the name `dir` shadows Python's built-in dir(); it is kept because
#  later cells build the file paths from it.
dir = "D:/Podcasts/TMHS/"
# exist_ok=True creates the folder only when it is missing, without a separate
#  "does it exist?" check that could go stale before makedirs runs.
os.makedirs(dir, exist_ok=True)
In [11]:
# Before downloading anything, eyeball the five oldest and the five newest titles
#  for characters that the OS might reject in a file name.
for position, sample in enumerate((list_of_titles[:5], list_of_titles[-5:])) :
    if position :
        print()          # blank line between the two samples
    for title in sample :
        print(title)
TMHS 001: Natural Treatment for Heart Disease, High Cholesterol, and High Blood Pressure
TMHS 002: The 5 Biggest Weight Loss Mistakes
TMHS 003: The Truth About Breast Cancer - Share this with every woman you know!
TMHS 004: Help Me Sleep! - 21 Ways To Cure Your Sleep Problems (Part 1)
TMHS 005: Help Me Sleep! – 21 Ways To Cure Your Sleep Problems (Part 2)

TMHS 349: The Microbiome-Emotion Connection & The Truth About Antidepressants - With Guest Dr. Jillian Teta
TMHS 350: The Secret Life Of Fat - With Guest Dr. Sylvia Tara
TMHS 351: Nutrition Under the Sea:  Beat Cancer, Boost Mood, & Fight Obesity
TMHS 352: Self Assessment, Adjusting To Change, & Social Media Detox - With Guest Anne Stevenson
TMHS 353: Overcoming Our Secret Stressors & How To Stop Being Hangry - With Guests Sarah Fragoso & Dr. Brooke Kalanick
In [12]:
# Ordered (old, new) substitutions. Order matters: double spaces are collapsed
#  before ':' becomes ' -', and the backslash-escaped forms are handled before
#  the bare '?' / '&' so that the backslash goes away together with them.
_TITLE_REPLACEMENTS = [
    ('  ', ' '),
    ('"', '\''),
    (':', ' -'),
    ('\\?', ''),
    ('?', ''),
    ('\\&', 'and'),
    ('&', 'and'),
]

# Remaining characters that Windows does not allow in file names.
_FORBIDDEN_CHARS = str.maketrans({'/': '-', '\\': '-', '|': '-', '*': '', '<': '', '>': ''})


def _sanitize_title(title) :
    """Return `title` rewritten so that it can be used as a file name.

    Applies the substitutions in `_TITLE_REPLACEMENTS` in order, then replaces
    or drops the characters listed in `_FORBIDDEN_CHARS`.
    """
    # str.replace is a no-op when nothing matches, so no `find` guard is needed.
    #  (A guard like `if title.find(x)` would even be wrong: find returns 0,
    #  which is falsy, for a match at the very start of the title.)
    for old, new in _TITLE_REPLACEMENTS :
        title = title.replace(old, new)
    return title.translate(_FORBIDDEN_CHARS)


proper_titles = [_sanitize_title(title) for title in list_of_titles]
In [13]:
# Same two samples as before, this time after the clean-up:
for position, sample in enumerate((proper_titles[:5], proper_titles[-5:])) :
    if position :
        print()          # blank line between the two samples
    for title in sample :
        print(title)
TMHS 001 - Natural Treatment for Heart Disease, High Cholesterol, and High Blood Pressure
TMHS 002 - The 5 Biggest Weight Loss Mistakes
TMHS 003 - The Truth About Breast Cancer - Share this with every woman you know!
TMHS 004 - Help Me Sleep! - 21 Ways To Cure Your Sleep Problems (Part 1)
TMHS 005 - Help Me Sleep! – 21 Ways To Cure Your Sleep Problems (Part 2)

TMHS 349 - The Microbiome-Emotion Connection and The Truth About Antidepressants - With Guest Dr. Jillian Teta
TMHS 350 - The Secret Life Of Fat - With Guest Dr. Sylvia Tara
TMHS 351 - Nutrition Under the Sea - Beat Cancer, Boost Mood, and Fight Obesity
TMHS 352 - Self Assessment, Adjusting To Change, and Social Media Detox - With Guest Anne Stevenson
TMHS 353 - Overcoming Our Secret Stressors and How To Stop Being Hangry - With Guests Sarah Fragoso and Dr. Brooke Kalanick
In [14]:
# We also noticed that the extension (which ain't always .mp3 in the wild)
#  sits between the last dot of the URL's path and the question mark that
#  starts the query string. Let's capture it.
list_of_ext = []

for link in list_of_urls :
    # Cut off the query string / fragment first, so that a dot inside them
    #  cannot be mistaken for the start of the extension.
    link_path = link.split('?', 1)[0].split('#', 1)[0]
    # splitext only looks at the last path component; the result is '' when
    #  that component has no extension.
    list_of_ext.append(os.path.splitext(link_path)[1].lstrip('.'))
In [15]:
# Now, to make a function: 
def download_episode(filename, url, timeout=60, chunk_size=1024 * 1024) :
    """Download `url` and save the response body to `filename`.

    The body is streamed to disk chunk by chunk, so an episode is never held
    in memory as a whole.

    Parameters:
        filename:   path of the file to create (or overwrite).
        url:        address to download from.
        timeout:    seconds passed to requests as the connect/read timeout.
        chunk_size: number of bytes requested per chunk.

    Returns:
        None.

    Raises:
        Whatever `raise_for_status()` raises for a 4xx/5xx answer
        (requests.HTTPError); in that case `filename` is not opened at all.
    """
    with requests.get(url, stream=True, timeout=timeout) as r :
        # Refuse to save an error page under an episode's file name.
        r.raise_for_status()
        with open(filename, 'wb') as fout :
            for chunk in r.iter_content(chunk_size=chunk_size) :
                if chunk :   # skip empty chunks
                    fout.write(chunk)
In [16]:
# Assemble everything required to fetch the very first episode, and show it.
title, ext, url = proper_titles[0], list_of_ext[0], list_of_urls[0]

filename = '{}{}.{}'.format(dir, title, ext)
print(filename, url, sep='\n')
D:/Podcasts/TMHS/TMHS 001 - Natural Treatment for Heart Disease, High Cholesterol, and High Blood Pressure.mp3
http://traffic.libsyn.com/themodelhealthshow/01-Heart_Health-AIR.mp3?dest-id=136195
In [17]:
# and test it: fetch the first episode to the path assembled above
download_episode(filename, url)

image

In [18]:
# And now, to get a batch that would last me some time, and
#  we'll speed the download up a little bit with a pool of threads:
from multiprocessing.pool import ThreadPool
In [38]:
ep_count = 30   # how many episodes, counted from the start of the lists, to fetch

# Pair every target path with its URL. Slicing (instead of indexing up to
#  ep_count) keeps this working when fewer than ep_count episodes are available.
filenames = [dir + title + '.' + ext
             for title, ext in zip(proper_titles[:ep_count], list_of_ext[:ep_count])]
urls      = list_of_urls[:ep_count]
arguments = list(zip(filenames, urls))

# Eight worker threads download in parallel; the `with` block shuts the pool
#  down once starmap has returned, so no idle threads are left behind.
with ThreadPool(8) as pool :
    results = pool.starmap(download_episode, arguments)
results
Out[38]:
[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

image

In [39]:
# And as a final touch, let's introduce a function that won't download episodes
#  we already have, so that we can just launch it every week to grab only the fresh ones.
def download_episode_smart(filename, url) :
    """Download `url` to `filename` unless `filename` already exists.

    The episode is first saved under the temporary name "<filename>.part" and
    renamed to `filename` only after `download_episode` has returned. A
    download that fails half-way therefore never leaves a file called
    `filename` behind, and the next run fetches the episode again instead of
    reporting "Already got".

    Parameters:
        filename: final path of the episode file.
        url:      address to download from.

    Returns:
        None.

    Raises:
        Whatever `download_episode` raises; the temporary file is removed
        before the error is passed on.
    """
    if os.path.exists(filename) :
        print("Already got " + filename + "\n")
        return

    print("Downloading " + filename + "\n")
    partial = filename + ".part"
    try :
        download_episode(partial, url)
    except BaseException :
        # Clean up the half-written file, then let the error surface.
        if os.path.exists(partial) :
            os.remove(partial)
        raise
    # Rename only now that the download is complete.
    os.replace(partial, filename)