Code Search for Developers
 
 
  

scrape_key_stats_skel.py from matplotlib at Krugle


Show scrape_key_stats_skel.py syntax highlighted

"""
Use urllib to download key statistics for several stock tickers; URL example pattern

http://finance.yahoo.com/q/ks?s=INTC
"""

import datetime, time  
import urllib
import os
import BeautifulSoup

XXX = None
def get_datadir():
    """
    return the data dir used by this program.  It should be
    data/key_stats.  use the "os" module to check for the directories existence and create it if
    necessary.  See os.exists, os.path.join and os.mkdir
    """
    XXX
    return datadir

def grab_data(tickers):
    """
    download the html file for each ticker in the list of tickers and
    put the output into datadir with a filename like
    'GOOG_key_stats.html'.  Look at urllib.urlretrieve for fetching
    the file over the web, and insert a time.sleep command in between
    each file grab to lower the burden on the server.

    Return value is a list of (ticker,
    pathname) where pathname is a path to the html file. 
    """
    datadir = get_datadir()
    datafiles = []
    for ticker in tickers:
        # make an output filename be creating a file like
        # "GOOG_key_stats.html" in the datadir
        fname = XXX

        # only download the file if it doesn't already exist.  Since these
        # stats can change daily, in real life we might want to put a date
        # stamp on the files too
        if not os.path.exists(fname):
            # build the URL from the format pattern and ticker and grab it with urllib
            # use time.sleep between grabs to be gentle on the server
            XXX
            print 'fetched %s into %s'%(ticker, fname)
        else:
            print 'already have', fname
        datafiles.append((ticker, fname))
    return datafiles


def convert(x):
    """
    The table data is all strings and we want to convert it to python
    datatypes as intelligently as possible.  For serios use, you would
    want converters depending on the column header, but where we'll
    just inspect the string and try and do something semi-intelligent.
    Eg, if it ends with '%', strip the '%' and return a float.  If it
    ends with 'M', strip the 'M' and multipl by 1e6 (likewise for 'K'
    and 'B').  Try to convert things that look like dates to
    python.date objects using time.strptime and datetime.date.  Try
    and convert to float using a try/except block.  If everything
    fails, just return the string
    """
    XXX
    return x

def parse_htmlfile(fname):
    """
    parse the key statistics html in fname and return a data
    dictionary.  The keys are the headers, and the values are the
    converted data items
    """

    # beautiful soup lets you filter html tags by their properties.  I
    # took a peak at one of the html sources and found the tags that
    # correspond to the tables, headers and dataitems we are interested
    # in.  Yahoo was nice enough to put "class" information in the tags
    # which makes this particularly easy.  We'll use the table props, the
    # headerprops and the dataprops to select out just the tables and
    # table elements we want
    tableprops  = {'class': 'yfnc_datamodoutline1'}
    headerprops = {'class': 'yfnc_tablehead1'}
    dataprops   = {'class': 'yfnc_tabledata1'}

    # create the beautiful soup instance with the html string
    soup = BeautifulSoup.BeautifulSoup(file(fname).read())
    datad = dict()
    for table in soup('table', **tableprops): # get all the data tables
        for row in table('tr'):               # iterate over all rows
            XXX # get the header and data items and store
            datad[header] = convert(data)  # call our all powerful convert function
    return datad


# a list of stock tickers to download and parse
tickers = 'INTC', 'MSFT', 'YHOO', 'GOOG', 'GE', 'WMT', 'CROX'

# we'll store the results in a dictionary of dictionaries.  tickerd is
# keyed off the ticker and points to a data dictionary returned by parse_fname
tickerd = dict()
for ticker, datafile in grab_data(tickers):
    tickerd[ticker] = parse_htmlfile(datafile)
    
# now let's pretty print the data for one ticker
ticker = 'INTC'
maxlen = max([len(header) for header in tickerd[ticker]])
for header, data in tickerd['INTC'].items():
    print '%s: %s'%(header.ljust(maxlen), data)




See more files for this project here

matplotlib

Matplotlib is a pure python plotting library with the goal of making\r\npublication quality plots using a syntax familiar to matlab users. \r\nThe library uses Numeric for handling large\r\ndata sets and supports a variety of output backends

Project homepage: http://sourceforge.net/projects/matplotlib
Programming language(s): C,C++,Python
License: other

  faces/
    faces_skel0.tgz
    fmatch_skel1.py
  fortran_wrap/
    Makefile
    fib3.f
    test.py
  distributions_skel.py
  erathostenes_skel.py
  fft_imdenoise_skel.py
  fit_synapse_skel.py
  fitting_skel.py
  montecarlo_pi_skel.py
  polyroots1d_skel.py
  qsort_skel.py
  quad_newton_skel.py
  recarray_demo_skel.py
  regress_demo_skel.py
  scrape_key_stats_skel.py
  shoot_skel.py
  spline_demo_skel.py
  stats_descriptives_skel.py
  stats_distributions_skel.py
  trapezoid_skel.py
  wallis_pi_skel.py
  wordfreqs_skel.py