## Cell 1: Basic HTML parser ------------------------------------------------
# NOTE: this notebook targets Python 2 (HTMLParser / urllib module names).

import HTMLParser
import urllib

# Text nodes collected in document order. Module-level on purpose: the
# commented-out loop below (and later cells) inspect it after the parse.
urlText = []


class parseText(HTMLParser.HTMLParser):
    """HTML parser that appends every non-newline text node to urlText."""

    def handle_data(self, data):
        # Skip bare newline nodes; all other text (including runs of other
        # whitespace) is kept verbatim.
        if data != '\n':
            urlText.append(data)


# Create an instance of the HTML parser.
lParser = parseText()

thisurl = "http://www-rohan.sdsu.edu/~gawron/index.html"

# Fetch the page; close the connection even if read() raises (the original
# urllib.urlopen(thisurl).read() leaked the open handle).
page = urllib.urlopen(thisurl)
try:
    html_gook = page.read()
finally:
    page.close()

# Feed the HTML into the parser.
lParser.feed(html_gook)
lParser.close()
#for item in urlText:
#    print item


## Cell 2: Newsfeeds --------------------------------------------------------
import os
import sys
import feedparser
from bs4 import BeautifulSoup
import urllib


def parseHtml(html):
    """Parse an HTML string and return its top-level BeautifulSoup nodes."""
    return BeautifulSoup(html).contents


def get_feedparser_feed(FEED_URL):
    """Fetch and parse the feed at FEED_URL.

    Returns the feedparser result when it contains at least one entry;
    otherwise prints a message and calls sys.exit() (raises SystemExit,
    which in a notebook aborts the cell rather than the kernel).
    """
    fp = feedparser.parse(FEED_URL)

    if fp and fp.entries and fp.entries[0]:
        print("Fetched %s entries from '%s'" % (len(fp.entries), fp.feed.title))
    else:
        print('No entries parsed!')
        sys.exit()
    return fp

    ## TODO: Look at fp.status for a 404.
    ## There may be page content but the page you asked for may be gone.
    ## Look at fp.feed.summary for a lot of URLs.


def get_blog_posts(fp):
    """Return a list of {'title', 'content', 'link'} dicts, one per entry.

    Atom entries carry their body in e.content; feeds without a 'content'
    element (plain RSS) fall back to e.summary_detail.
    """
    # NOTE: the original declared `global feed_dict, blog_posts`, leaking
    # per-iteration state into the module namespace; callers only ever use
    # the return value, so the globals are gone.
    blog_posts = []
    for e in fp.entries:
        try:
            content = e.content[0]
        except AttributeError:
            # No 'content' attribute on this entry: use the summary instead.
            content = e.summary_detail
        blog_posts.append({'title': e.title,
                           'content': parseHtml(content.value),
                           'link': e.links[0].href})
    return blog_posts


## Cell 3: fetch the O'Reilly Radar feed ------------------------------------
FEED_URL = 'http://feeds.feedburner.com/oreilly/radar/atom'
fp = get_feedparser_feed(FEED_URL)
blog_posts = get_blog_posts(fp)
) and some can't ().\n", " Table and list tags have a natural nesting order. For instance,
tags go inside tags, not the other way around.\n", " The contents of a