Code Search for Developers
 
 
  

scrape_key_stats.py from matplotlib at Krugle


Show scrape_key_stats.py syntax highlighted

"""
Use urllib to download key statistics for several stock tickers; URL
example pattern

http://finance.yahoo.com/q/ks?s=INTC
"""

import datetime, time  
import urllib
import os
import BeautifulSoup


def get_datadir():
    """
    Return the directory where the downloaded key-statistics pages live.

    The directory is data/key_stats, relative to the current working
    directory.  Each level of that path is created with os.mkdir when
    os.path.exists reports that it is missing, so the returned path
    exists by the time this function returns.
    """
    location = ''
    # walk down the path one component at a time so the parent is
    # always in place before os.mkdir is asked to create the child
    for component in ('data', 'key_stats'):
        location = os.path.join(location, component)
        if os.path.exists(location):
            continue
        os.mkdir(location)
    return location

def grab_data(tickers):
    """
    Download the key statistics html page for each ticker in the list
    of tickers and store it in the data directory under a filename
    like 'GOOG_key_stats.html'.

    A ticker whose file already exists is not downloaded again.  Each
    download is first written to a temporary '<name>.part' file and
    only renamed to its final name once urllib.urlretrieve has
    returned, so a failed or interrupted download never leaves a
    truncated file that a later run would mistake for a cached page.
    Any exception raised by the download propagates to the caller.

    Return value is a list of (ticker, pathname) where pathname is the
    path to the html file for that ticker.
    """
    datadir = get_datadir()
    datafiles = []
    for ticker in tickers:
        # make an output filename by creating a name like
        # "GOOG_key_stats.html" in the datadir
        fname = os.path.join(datadir, ticker+'_key_stats.html')

        # only download the file if it doesn't already exist.  Since these
        # stats can change daily, in real life we might want to put a date
        # stamp on the files too
        if os.path.exists(fname):
            print('already have %s' % fname)
        else:
            # build the URL from the format pattern and ticker
            url = 'http://finance.yahoo.com/q/ks?s=' + ticker
            partial = fname + '.part'
            try:
                urllib.urlretrieve(url, partial)
                os.rename(partial, fname)
            finally:
                # after a successful rename the partial file is gone and
                # this is a no-op; after a failure it removes the debris
                if os.path.exists(partial):
                    os.remove(partial)
            print('fetched %s into %s' % (ticker, fname))
            # use time.sleep between grabs to be gentle on the server
            time.sleep(0.5)
        datafiles.append((ticker, fname))
    return datafiles


def convert(x):
    """
    Convert a table string to a python datatype as intelligently as
    possible.  For serious use, you would want converters depending on
    the column header, but here we just inspect the string and try to
    do something semi-intelligent:

      * 'N/A' is returned as None
      * a number followed by '%' is returned as a float with the '%'
        stripped (the value is not divided by 100)
      * a number followed by 'K', 'M' or 'B' is returned as a float
        multiplied by 1e3, 1e6 or 1e9 respectively
      * anything float() accepts is returned as a float
      * a date in the form '26-Dec-07' ('%d-%b-%y') is returned as a
        datetime.date

    If everything fails, the string itself is returned unchanged.  In
    particular a non-numeric string that merely ends in one of the
    suffix characters (eg 'IBM') comes back as is rather than raising
    a ValueError.
    """
    if x == 'N/A':
        return None

    # (suffix, multiplier) pairs; '%' only strips the sign
    suffixes = (('%', 1.0), ('B', 1e9), ('M', 1e6), ('K', 1e3))
    for suffix, scale in suffixes:
        if x.endswith(suffix):
            try:
                return float(x[:-1]) * scale
            except ValueError:
                # not a number in front of the suffix; only one suffix
                # can match, so fall through to the other conversions
                break

    # try to convert to float
    try:
        return float(x)
    except ValueError:
        pass

    # try to convert to date
    try:
        y, m, d = time.strptime(x, '%d-%b-%y')[:3]
    except ValueError:
        pass
    else:
        return datetime.date(y, m, d)

    return x

def parse_htmlfile(fname):
    """
    Parse the key statistics html in the file fname and return a data
    dictionary.  The keys are the table headers (whitespace stripped
    and any trailing ':' removed), and the values are the data items
    after they have been passed through convert.

    Only table rows that contain exactly one header cell and exactly
    one data cell contribute an entry; all other rows are ignored.
    """

    # beautiful soup lets you filter html tags by their properties.  I
    # took a peek at one of the html sources and found the tags that
    # correspond to the tables, headers and dataitems we are interested
    # in.  Yahoo was nice enough to put "class" information in the tags
    # which makes this particularly easy.  We'll use the table props, the
    # headerprops and the dataprops to select out just the tables and
    # table elements we want
    tableprops  = {'class': 'yfnc_datamodoutline1'}
    headerprops = {'class': 'yfnc_tablehead1'}
    dataprops   = {'class': 'yfnc_tabledata1'}

    # read the html string, making sure the file handle is closed again
    # even if the read fails
    fh = open(fname)
    try:
        html = fh.read()
    finally:
        fh.close()

    # create the beautiful soup instance with the html string
    soup = BeautifulSoup.BeautifulSoup(html)
    datad = dict()
    for table in soup('table', **tableprops): # get all the data tables
        for row in table('tr'):               # iterate over all rows
            headers = row('td', **headerprops) # td elements with the header class
            cells = row('td', **dataprops)     # td elements with the data class
            # there should be exactly one header and one data item
            if len(headers) != 1 or len(cells) != 1:
                continue
            # convert the first child of each cell to a string and
            # strip whitespace
            header = str(headers[0].contents[0]).strip()
            if header.endswith(':'):
                header = header[:-1]
            data = str(cells[0].contents[0]).strip()
            datad[header] = convert(data)  # call our all powerful convert function
    return datad


# a list of stock tickers to download and parse
tickers = 'INTC', 'MSFT', 'YHOO', 'GOOG', 'GE', 'WMT', 'CROX'

# we'll store the results in a dictionary of dictionaries.  tickerd is
# keyed off the ticker and points to the data dictionary returned by
# parse_htmlfile
tickerd = dict()
for ticker, datafile in grab_data(tickers):
    tickerd[ticker] = parse_htmlfile(datafile)

# now let's pretty print the data for one ticker.  Both the column
# width and the rows come from tickerd[ticker], so changing the ticker
# here is enough to print a different one
ticker = 'INTC'
maxlen = max([len(header) for header in tickerd[ticker]])
for header, data in tickerd[ticker].items():
    print('%s: %s' % (header.ljust(maxlen), data))




See more files for this project here

matplotlib

Matplotlib is a pure python plotting library with the goal of making publication quality plots using a syntax familiar to matlab users. The library uses Numeric for handling large data sets and supports a variety of output backends

Project homepage: http://sourceforge.net/projects/matplotlib
Programming language(s): C,C++,Python
License: other

  data/
    key_stats/
      CROX_key_stats.html
      GE_key_stats.html
      GOOG_key_stats.html
      INTC_key_stats.html
      MSFT_key_stats.html
      WMT_key_stats.html
      YHOO_key_stats.html
    HISTORY.gz
    ge.csv
    hsales.dat
    monthly_sunspots.dat
    moonlanding.jpg
    nm560.dat
    synapse_data.dat
    synapse_times.dat
  extras/
    fft_demo.py
    spec_interp.py
    steinman_interp.py
    weave_examples.py
  faces/
    data_test/
      face10tn.pcx
      face11tn.pcx
      face12tn.pcx
      face13tn.pcx
      face14tn.pcx
      face15tn.pcx
      face16tn.pcx
      face17tn.pcx
      face18tn.pcx
      face19tn.pcx
      face1tn.pcx
      face20tn.pcx
      face21tn.pcx
      face22tn.pcx
      face23tn.pcx
      face24tn.pcx
      face25tn.pcx
      face26tn.pcx
      face27tn.pcx
      face28tn.pcx
      face29tn.pcx
      face2tn.pcx
      face30tn.pcx
      face3tn.pcx
      face4tn.pcx
      face5tn.pcx
      face6tn.pcx
      face7tn.pcx
      face8tn.pcx
      face9tn.pcx
    data_train/
      face10n.pcx
      face11n.pcx
      face12n.pcx
      face13n.pcx
      face14n.pcx
      face15n.pcx
      face16n.pcx
      face17n.pcx
      face18n.pcx
      face19n.pcx
      face1n.pcx
      face20n.pcx
      face21n.pcx
      face22n.pcx
      face23n.pcx
      face24n.pcx
      face25n.pcx
      face26n.pcx
      face27n.pcx
      face28n.pcx
      face29n.pcx
      face2n.pcx
      face30n.pcx
      face31n.pcx
      face32n.pcx
      face33n.pcx
      face34n.pcx
      face35n.pcx
      face36n.pcx
      face37n.pcx
      face38n.pcx
      face39n.pcx
      face3n.pcx
      face40n.pcx
      face41n.pcx
      face42n.pcx
      face43n.pcx
      face44n.pcx
      face45n.pcx
      face46n.pcx
      face47n.pcx
      face48n.pcx
      face49n.pcx
      face4n.pcx
      face50n.pcx
      face51n.pcx
      face52n.pcx
      face53n.pcx
      face54n.pcx
      face55n.pcx
      face56n.pcx
      face57n.pcx
      face58n.pcx
      face59n.pcx
      face5n.pcx
      face60n.pcx
      face61n.pcx
      face62n.pcx
      face63n.pcx
      face64n.pcx
      face65n.pcx
      face6n.pcx
      face7n.pcx
      face8n.pcx
      face9n.pcx
    faces.py
    fmatch.py
    imatch.py
    imatch2.py
    imtools.py
    test_imatch.py
  logistic/
    sethna_ori/
    __init__.py
    exercise01.py
    exercise02.py
    maplib.py
    maplib.pyc
  numpy_wrap/
    f2py/
    pyrex/
    swig/
  schrodinger/
    Schrodinger_FDTD.pdf
    schrod_fdtd.py
  skel/
    faces/
    fortran_wrap/
    distributions_skel.py
    erathostenes_skel.py
    fft_imdenoise_skel.py
    fit_synapse_skel.py
    fitting_skel.py
    montecarlo_pi_skel.py
    polyroots1d_skel.py
    qsort_skel.py
    quad_newton_skel.py
    recarray_demo_skel.py
    regress_demo_skel.py
    scrape_key_stats_skel.py
    shoot_skel.py
    spline_demo_skel.py
    stats_descriptives_skel.py
    stats_distributions_skel.py
    trapezoid_skel.py
    wallis_pi_skel.py
    wordfreqs_skel.py
  visual/
    bounce.py
    shoot.py
    shoot_t.py
    toroid_drag.py
  BeautifulSoup.py
  __init__.py
  bessel.py
  distributions.py
  erathostenes.py
  erathostenes_fperez.py
  erathostenes_list.py
  erathostenes_set.py
  fft_imdenoise.py
  fit_synapse.py
  fitting.py
  getbibtex.py
  lsys.py
  montecarlo_pi.py
  numpy-blitz_1000.png
  numpy-blitz_300.png
  numpy-blitz_500.png
  numpy_slicing.py
  polyroots1d.py
  qsort.py
  quad_newton.py
  recarray_demo.py
  regress.py
  regress_demo.py
  scrape_key_stats.py
  spline_demo.py
  stats_descriptives.py
  stats_distributions.py
  test.ipy
  trapezoid.py
  wallis_pi.py
  weave_blitz.py
  weave_blitz0.py
  weave_blitz_comp.png
  weave_examples_simple.py
  weave_exercises.py
  wordfreqs.py