roc.py : » Language-Interface » PyML » PyML-0.7.3 » PyML » evaluators » Python Open Source

1.	3.1.2 Python
2.	Ajax
3.	Aspect Oriented
4.	Blog
5.	Build
6.	Business Application
7.	Chart Report
8.	Content Management Systems
9.	Cryptographic
10.	Database
11.	Development
12.	Editor
13.	Email
14.	ERP
15.	Game 2D 3D
16.	GIS
17.	GUI
18.	IDE
19.	Installer
20.	IRC
21.	Issue Tracker
22.	Language Interface
23.	Log
24.	Math
25.	Media Sound Audio
26.	Mobile
27.	Network
28.	Parser
29.	PDF
30.	Project Management
31.	RSS
32.	Search
33.	Security
34.	Template Engines
35.	Test
36.	UML
37.	USB Serial
38.	Web Frameworks
39.	Web Server
40.	Web Services
41.	Web Unit
42.	Wiki
43.	Windows
44.	XML
Python Open Source » Language Interface » PyML
PyML » PyML 0.7.3 » PyML » evaluators » roc.py
"""
Module for computing ROC curves and AUC scores

Theoretical and pratical concepts from 
Fawcett, T.  ROC graphs: Notes and pratical considerations
for data mining researchers.  HPL-2003-4, 2003.

By Michael Hamilton, 2009
With edits by Asa Ben-Hur 2010
"""

import numpy
from bisect import bisect_left# for binary search
fromscipystats# for computing confidence intvls

def roc( dvals, labels, rocN=None, normalize=True ) :
    """
    Compute ROC curve coordinates and area

    - `dvals`  - a list with the decision values of the classifier
    - `labels` - list with class labels, \in {0, 1} 

    returns (FP coordinates, TP coordinates, AUC )
    """
    if rocN is not None and rocN < 1 :
        rocN = int(rocN * numpy.sum(numpy.not_equal(labels, 1)))

    TP = 0.0  # current number of true positives
    FP = 0.0  # current number of false positives
    
    fpc = [ 0.0 ]  # fp coordinates
    tpc = [ 0.0 ]  # tp coordinates
    dv_prev = -numpy.inf # previous decision value
    TP_prev = 0.0
    FP_prev = 0.0
    area = 0.0

    num_pos = labels.count( 1 )  # number of pos labels
    num_neg = labels.count( 0 ) # number of neg labels
    
    if num_pos == 0 or num_pos == len(labels) :
        raise ValueError, "There must be at least one example from each class"

    # sort decision values from highest to lowest
    indices = numpy.argsort( dvals )[ ::-1 ]
    
    for idx in indices:
        # increment associated TP/FP count
        if labels[ idx ] == 1:
            TP += 1.
        else:
            FP += 1.
            if rocN is not None and FP == rocN : 
                break
        # Average points with common decision values
        # by not adding a coordinate until all
        # have been processed
        if dvals[ idx ] != dv_prev:
            if len(fpc) > 0 and FP == fpc[-1] :
                tpc[-1] = TP
            else :
                fpc.append( FP  )
                tpc.append( TP  )
            dv_prev = dvals[ idx ]
            area += _trap_area( ( FP_prev, TP_prev ), ( FP, TP ) )
            FP_prev = FP
            TP_prev = TP

    #area += _trap_area( ( FP, TP ), ( FP_prev, TP_prev ) )
    #fpc.append( FP  )
    #tpc.append( TP )
    if normalize :
        fpc = [ float( x ) / FP for x in fpc ]
        if TP > 0:
            tpc = [ float( x ) / TP for x in tpc ]
        if area > 0:
            area /= ( TP * FP )

    return fpc, tpc, area
    
def roc_VA( folds, rocN=None, n_samps=100 ):
    """
    Compute ROC curve using vertical averaging

    `folds` - list of ( labels, dvals ) pairs where labels
              is a list of class labels and dvals are
              the decision values of the classifier
    """
    # return variables
    invl = 1.0 / n_samps # interval to sample FPR
    FPRs = numpy.arange( 0, (1+invl), invl )
    TPRs = [ ] # will contain assoc TPR avgs for FPRs
    # folds must be listified
    assert type( folds ) == type( [ ] )
    rocs = [ ] # list of roc tuples ( [FPR,TPR] ) for folds
    areas = [ ] # individual AUCs for each fold

    # calculate individual ROC curves for each fold
    for dvals,labels in folds:
        fpc, tpc, area = roc( dvals, labels, rocN )
        rocs.append( (fpc, tpc) )
        areas.append( area )

    for fpr in FPRs:
        # accumulate TPRs for current fpr over all folds
        tpr_folds = [ ] 
        # fix FPR and accumulate (interpolated) TPRs
        for fpc, tpc in rocs:
            tpr_folds.append( _tpr_for_fpr( fpc, tpc, fpr ))
        # average tprs and append
        TPRs.append( numpy.mean( tpr_folds ) )
    
    return FPRs, numpy.array( TPRs ), numpy.mean( areas )


def _tpr_for_fpr( fpc, tpc, fpr ):
    """
    Returns the (estimated) tpr for the given fpr for 
    the given false positive/true positive coordinates
    from an ROC curve.

    `fpc` - False positive coordinates from ROC curve
    `tpc` - True positive coordinates from ROC curve
    """
    
    # take advantage of monotonic property of ROC curves
    # and search for fpr in O( log n ) time
    idx = bisect_left( fpc, fpr )
    
    # if exact match, then return
    if fpc[ idx ] == fpr:
        return tpc[ idx ]

    else:
        # check if idx is last index of fpc
        #if idx == len( fpc ) - 1:
        #    return tpc[ idx ]

        # check if the neighboring fprs are identical
        #elif fpc[ idx ] == fpc[ idx + 1 ]:
            # return the average of the tprs 
        #    return ( tpc[ idx ] + tpc[ idx+1 ] ) / 2.0
        # otherwise, interpolate the tpr
        return _interpolate( ( fpc[ idx-1 ], tpc[ idx-1 ] ),
                             ( fpc[ idx ], tpc[ idx ] ),
                             fpr )

def _interpolate( p1, p2, x ):
    """
    Interpolate the value of f( x ).

    `p1` - 1st interpolation point (x1, y1)
    `p1` - 2nd interpolation point (x2, y2)
    `x`  - the value to interpolate
    """
    return p1[ 1 ] + _slope( p1, p2 ) * ( x - p1[ 0 ] )

def _trap_area( p1, p2 ):
    """
    Calculate the area of the trapezoid defined by points
    p1 and p2
    
    `p1` - left side of the trapezoid
    `p2` - right side of the trapezoid
    """
    base = abs( p2[ 0 ] - p1[ 0 ] )
    avg_ht = ( p1[ 1 ] + p2[ 1 ] ) / 2.0

    return base * avg_ht

def _slope( p1, p2 ):
    """
    Calculates the slope of the line defined by
    points p1 and p2
    """
    delta_x = p2[ 0 ] - p1[ 0 ]
    delta_y = p2[ 1 ] - p1[ 1 ]
    
    # if infinite slope, scream
    if delta_x == 0: raise( "Infinite slope" )

    return float( delta_y ) / delta_x
    
def _conf( x, p=0.95 ):
    """
    Compute half the length of the confidience interval of 
    x for confidence p, 0 < p < 1.

    `x` - sample array
    `p` - confidence level
    """
    
    p = p + ( 1 - p ) / 2.0 # for 2-tailed t-distribution
    std_err = stats.stderr( x )
    df = len( x ) - 1
    t = stats.t.ppf( p, df )
    
    return t * std_err


def plotROC(rocFP, rocTP, fileName = None, **args) :
    """plot the ROC curve from a given Results (or Results-like) object

    :Parameters:
      - `res` - Results (or Container object that was made by saving a a
        Results object (note that if you have a Results object you can
        use this function as a method so there is no need to supply this
        argument).
      - `fileName` - optional argument - if given, the roc curve is saved
        in the given file name.  The format is determined by the extension.
        Supported extensions: .eps, .png, .svg
    
    :Keywords:
      - `normalize` - whether to normalize the ROC curve (default: True)
      - `plotStr` - which string to pass to matplotlib's plot function
        default: 'ob'
      - `axis` - redefine the figure axes; takes a list of the form
        [xmin,xmax,ymin,ymax]
      - `show` - whether to show the ROC curve (default: True)
        useful when you just want to save the curve to a file.
        The use of Some file formats automatically sets this to False
        (e.g. svg files).  This relates to quirks of matplotlib.
    """

    if 'show' in args :
        show = args['show']
    else :
        show = True
    if 'plotStr' in args :
        plotStr = args['plotStr']
    else :
        plotStr = 'ob'
    rocNormalize = True
    if 'normalize' in args :
        rocNormalize = args['normalize']

    numPoints = 200
    if 'numPoints' in args :
        numPoints = args['numPoints']
        
    stride = int(max(1, float(len(rocTP)) / float(numPoints)))

    if stride > 1 :
        rocTP = [rocTP[i] for i in range(0,len(rocTP), stride)]
        rocFP = [rocFP[i] for i in range(0,len(rocFP), stride)]        
        
    import matplotlib
    if fileName is not None and fileName.find('.svg') > 0 :
        matplotlib.use('SVG')
        show = False
    if fileName is not None and fileName.find('.eps') > 0 :
        matplotlib.use('PS')
        show = False

    from matplotlib import pylab
    lines = pylab.plot(rocFP, rocTP, plotStr,
                        markersize = 8, linewidth = 3)
    if rocNormalize :
        pylab.xlabel('False positive rate', fontsize = 18)
        pylab.ylabel('True positive rate', fontsize = 18)
    else :
        pylab.xlabel('False positives', fontsize = 18)
        pylab.ylabel('True positives', fontsize = 18)
    if rocNormalize :
        pylab.axis([0, 1, 0, 1])
    if 'axis' in args :
        pylab.axis(args['axis'])
    print fileName
    if fileName is not None :
        pylab.savefig(fileName)
    if show :
        pylab.show()

def plotROCs(resList, descriptions = None, fileName = None, **args) :

    """
    plot multiple ROC curves.

    :Parameters:
      - `resList` - a list or dictionary of Result or Result-like objects
      - `descriptions` - text for the legend (a list the size of resList).
        A legend is not shown if this parameter is not given
        In the case of a dictionary input the description for the legend is
        taken from the dictionary keys.
      - `fileName` - if given, a file to save the figure in

    :Keywords:
      - `legendLoc` - the position of the legend -- an integer between 0 and 9;
        see the matplotlib documentation for details
      - `plotStrings` - a list of matlab style plotting string to send to the
        plotROC function (instead of the plotString keyword of plotROC)
      - `other keywords` - keywords of the plotROC function 
    """

    if type(resList) == type([]) and type(resList[0]) == type('') :
        fileNames = resList
        resList = []
        for fileName in fileNames :
            resList.append(myio.load(fileName))
        if descriptions is None :
            descriptions = []
            for fileName in fileNames :
                descriptions.append(os.path.splitext(fileName)[0])

    import matplotlib
    show = True
    if fileName is not None and fileName.find('.svg') > 0 :
        matplotlib.use('SVG')
        show = False
    if fileName is not None and fileName.find('.eps') > 0 :
        matplotlib.use('PS')
        show = False
                
    from matplotlib import pylab
    args['show'] = False
    
    plotStrings = ['bo', 'k^', 'rv', 'g<', 'm>', 'k<']
    plotStrings = ['b-', 'k--', 'r-', 'g-.', 'm-', 'k:', 'b-', 'r-', 'g-']
    #plotStrings = ['b:', 'k-.', 'b-', 'g-', 'm-', 'k-', 'b-', 'r-', 'g-']
    if 'plotStrings' in args :
        plotStrings = args['plotStrings']
    if type(resList) == type([]) :
        for i in range(len(resList)) :
            print i
            args['plotStr'] = plotStrings[i]
            plotROC(resList[i], **args)
    else :
        if descriptions is None :
            descriptions = [key for key in resList]
        i = 0
        for key in resList :
            args['plotStr'] = plotStrings[i]
            plotROC(resList[key], **args)
            i+=1

    if descriptions is not None :
        legendLoc = 'best'
        if 'legendLoc' in args :
            legendLoc = args['legendLoc']
        pylab.legend(descriptions, loc = legendLoc)
        leg = pylab.gca().get_legend()
        ltext  = leg.get_texts()  # all the text.Text instance in the legend
        llines = leg.get_lines()  # all the lines.Line2D instance in the legend
        frame  = leg.get_frame()  # the Rectangle instance surrounding the legend

        #frame.set_facecolor(0.80)     # set the frame face color to light gray
        for obj in ltext :
            obj.set_size(14)
        #leg.draw_frame(False)         # don't draw the legend frame

    print fileName
    
    if fileName is not None :
        pylab.savefig(fileName)
    if show :
        pylab.show()


def test():
    # Testing
    import numpy.random as rand
    import pylab

    ############## Make synthetic data #################

    # Simulate protein binding site by having few positive
    # class examples 
    folds = [ ]
    labs = [ -1 ] * 500
    labs.extend( [ 1 ] * 20 )
    N = 40  # number of sequences
    
    # set means of decision values
    mu_n = -1
    mu_p = 1
    
    # draw decision values from normal distribution
    # for each class
    for i in xrange( N ):
        preds = rand.normal( mu_n, 1, 500 ).tolist()
        preds.extend( rand.normal( mu_p, 1, 20 ).tolist() )
        folds.append( ( labs[ : ], preds ) )

    # Calculate average ROC curve
    fpr, tpr, tpr_ci, area, area_ci = roc_VA( folds )

    # Print AUC with confidence interval
    print "area: %f, 95%% CI: (%f, %f)" % ( area, area-area_ci, area+area_ci )

    ####### Plot ROC curve with 95% confidence band  #######
    fig = pylab.figure( )
    ax = fig.add_subplot( 111 )
    # fill upper band
    xs, ys = pylab.poly_between( fpr, tpr, numpy.array( tpr ) + numpy.array( tpr_ci ) ) 
    ax.fill( xs, ys, 'r' )
    # fill lower band
    xs, ys = pylab.poly_between( fpr, tpr, numpy.array( tpr ) - numpy.array( tpr_ci ) ) 
    ax.fill( xs, ys, 'r' )
    # label axes and main
    pylab.xlabel( "FPR" )
    pylab.ylabel( "TPR" )
    pylab.title( "Example ROC curve with 95% confidence band" )
    ax.plot( numpy.arange( 0, 1.1, .1 ), numpy.arange( 0, 1.1, .1  ), 'k--')
    pylab.xlim( -.01, 1.01 )
    pylab.ylim( -.01, 1.01 )
    pylab.show( )
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.