preproc.py :  » Language-Interface » PyML » PyML-0.7.3 » PyML » preproc » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Language Interface » PyML 
PyML » PyML 0.7.3 » PyML » preproc » preproc.py

from PyML.base.pymlObject import PyMLobject
from PyML.utils import misc
import numpy
import math

import random

def pca(X, numcomp = None) :
    '''returns the matrix X as represented in the numcomp leading principal
    components

    The principal directions are the right singular vectors of X exactly as
    given: X is not centered by this function.

    X -- 2-d array, one pattern per row
    numcomp -- number of leading components to keep;
               if numcomp is None, all principal components are returned

    returns numpy.dot(X, V), where the columns of V are the numcomp leading
    right singular vectors of X
    '''

    shape = numpy.shape(X)
    n, d = shape[0], shape[1]
    if numcomp is None :
        numcomp = d

    # Only the right singular vectors are used.  When n >= d the reduced
    # decomposition already contains all d of them and avoids building the
    # n x n matrix of left singular vectors; when n < d the full
    # decomposition is requested so that all d directions remain available.
    u, s, vt = numpy.linalg.svd(X, full_matrices = (n < d))
    components = numpy.transpose(vt)[:, :numcomp]

    return numpy.dot(X, components)


def centerColumns(X) :
    '''returns X - mean(X), where the mean is taken over the columns of X

    Each column of the result has the mean of that column subtracted.

    X -- 2-d array-like, one pattern per row
    '''

    X = numpy.asarray(X)

    # The axis must be given explicitly: numpy.mean(X) on its own averages
    # over the flattened array and returns a single number.
    m = numpy.mean(X, 0)

    # broadcasting subtracts the vector of column means from every row
    return X - m


def centerRows(X) :
    '''returns X with the mean of each row subtracted from that row

    X -- 2-d array-like, one pattern per row
    '''

    X = numpy.asarray(X)

    # Row means are computed directly with an explicit axis, so the result
    # does not depend on what numpy.mean does when no axis is given.
    m = numpy.mean(X, 1)

    return X - m[:, numpy.newaxis]


def standardizeColumns(X) :
    '''returns (X - mean(X)) / std(X), computed separately for each column

    The standard deviation is numpy.std with its default normalization
    (division by the number of rows).  A column whose standard deviation is
    zero is not treated specially.

    X -- 2-d array-like, one pattern per row
    '''

    X = numpy.asarray(X)

    # Explicit axes: without them numpy.mean / numpy.std reduce over the
    # flattened array and return single numbers.
    m = numpy.mean(X, 0)
    sd = numpy.std(X, 0)

    return (X - m) / sd
    
def standardizeRows(X) :
    '''returns (X - mean(X)) / std(X), computed separately for each row

    The standard deviation is numpy.std with its default normalization
    (division by the number of columns).  A row whose standard deviation is
    zero is not treated specially.

    X -- 2-d array-like, one pattern per row
    '''

    X = numpy.asarray(X)

    # Row statistics are computed directly with an explicit axis, so the
    # result does not depend on what numpy.mean / numpy.std do when no axis
    # is given.
    m = numpy.mean(X, 1)
    sd = numpy.std(X, 1)

    return (X - m[:, numpy.newaxis]) / sd[:, numpy.newaxis]
    

def maxvar(X, numVariables) :
    '''returns the numVariables variables with the highest variance

    X -- 2-d array-like, one pattern per row, one variable per column
    numVariables -- number of columns to keep; values larger than the number
                    of columns keep all of them, values <= 0 keep none

    returns an array made of the selected columns of X, ordered by
    increasing standard deviation
    '''

    # per-column spread; the axis has to be explicit, since numpy.std(X)
    # on its own reduces over the flattened array
    s = numpy.std(X, 0)
    I = numpy.argsort(s)

    # Clamp the count and slice from an explicit start index:
    # I[-numVariables:] would select every column when numVariables is 0.
    numKeep = max(0, min(numVariables, len(I)))
    keep = I[len(I) - numKeep:]

    return numpy.take(X, keep, 1)


def dmat(X) :
    '''returns the Euclidean distance-squared matrix

    Entry (i,j) of the result is the squared Euclidean distance between
    rows i and j of X, obtained from the matrix of dot products K as
    K[i,i] - 2 K[i,j] + K[j,j].

    X -- 2-d array-like, one pattern per row

    returns a symmetric (n,n) float array with a zero diagonal, where n is
    the number of rows of X
    '''

    X = numpy.asarray(X, dtype = float)
    K = numpy.dot(X, numpy.transpose(X))
    sqNorms = numpy.diagonal(K)

    D = sqNorms[:, numpy.newaxis] - 2 * K + sqNorms[numpy.newaxis, :]

    # Keep the entries above the diagonal (every pair i < j, row 0 included)
    # and mirror them, so the result is exactly symmetric and its diagonal
    # is exactly zero.
    upper = numpy.triu(D, 1)

    return upper + numpy.transpose(upper)


def norm2(x) :
    '''Euclidean (2-) norm of a vector.

    x -- the vector, given as a list or a numpy array

    returns the square root of the sum of the squared entries of x,
    as a Python float
    '''

    values = numpy.asarray(x)
    sumOfSquares = numpy.sum(values * values)

    return math.sqrt(sumOfSquares)


def normalizeNorm(X) :
    '''normalize each row of X to unit vectors

    X -- 2-d array-like, one pattern per row

    returns a float array of the same shape as X in which every row has
    been divided by its 2-norm; a row made only of zeros has no direction
    and is returned unchanged (all zeros)
    '''

    X = numpy.asarray(X, dtype = float)
    # unpacking also checks that X is 2-dimensional
    (numRows, numCols) = numpy.shape(X)

    norms = numpy.sqrt(numpy.sum(X * X, 1))
    # dividing an all-zero row by its norm would be 0/0; divide by 1 instead
    norms[norms == 0] = 1.0

    return X / norms[:, numpy.newaxis]
    
class Correlator (object) :
    '''Correlation coefficients between the patterns (rows of data.X) of a
    dataset, addressed either by row index or by pattern ID.

    Attributes set by the constructor:

    data -- the dataset
    idDict -- the result of misc.list2dict(patternIDs, row indices); it is
              used in this class to look up the row index of a pattern ID
    mean, std -- per-row mean and standard deviation of data.X; standard
                 deviations smaller than 1e-5 are replaced by 1
    meanCorrelation -- mean correlation of 10000 randomly drawn pairs of rows
    numCorrelations -- number of random rows (1000) sampled by corrcoef when
                       exactly one of the two IDs is unknown
    '''

    def __init__(self, data) :
        '''data -- a dataset object providing X, labels.patternID and len(),
        or a string, which is taken to be the name of a csv file to load'''

        if isinstance(data, str) :
            print('file name: %s' % data)
            # NOTE(review): 'datafunc' is not imported in the part of this
            # module visible here -- confirm where it comes from; if it is
            # missing, passing a file name fails with a NameError.
            data = datafunc.PyVectorDataSet(data, idColumn = 0, headerRow = True, hint = 'csv')

        self.data = data
        self.idDict = misc.list2dict(data.labels.patternID,
                                     range(len(data)))

        print(numpy.shape(data.X))
        self.mean = numpy.mean(data.X, 1)
        self.std = std(data.X, 1)
        # avoid division by (nearly) zero in _corrcoef
        eps = 1e-5
        I = numpy.nonzero(numpy.less(self.std, eps))[0]
        print('num zeros: %d' % len(I))
        numpy.put(self.std, I, 1)

        # estimate the typical correlation from randomly drawn pairs of rows
        numPairs = 10000
        correlations = numpy.zeros(numPairs, float)
        for i in range(numPairs) :
            i1 = random.randrange(0, len(data))
            i2 = random.randrange(0, len(data))
            correlations[i] = self._corrcoef(i1, i2)
        self.meanCorrelation = numpy.mean(correlations)

        # sample size used by corrcoef for a single unknown ID
        self.numCorrelations = 1000

    def corrcoef(self, id1, id2) :
        '''returns the correlation between two patterns

        id1, id2 -- row indices (integers) or pattern IDs

        Equal arguments give 1.0.  If id1 is an integer both arguments are
        used as row indices.  Otherwise the arguments are looked up in
        idDict: if both are known their correlation is returned, if neither
        is known the result is meanCorrelation, and if exactly one is known
        the result is the mean correlation between the known pattern and
        numCorrelations randomly chosen rows.
        '''

        if id1 == id2 :
            return 1.0
        if isinstance(id1, int) :
            return self._corrcoef(id1, id2)

        known1 = id1 in self.idDict
        known2 = id2 in self.idDict
        if known1 and known2 :
            return self._corrcoef(self.idDict[id1], self.idDict[id2])
        if not known1 and not known2 :
            return self.meanCorrelation

        # exactly one of the two IDs is present in the data
        if known1 :
            knownID = id1
        else :
            knownID = id2
        i2 = self.idDict[knownID]
        correlations = numpy.zeros(self.numCorrelations, float)
        for i in range(self.numCorrelations) :
            i1 = random.randrange(0, len(self.data))
            correlations[i] = self._corrcoef(i1, i2)
        return numpy.mean(correlations)

    def _corrcoef(self, i1, i2) :
        '''returns the correlation between rows i1 and i2 of data.X, using
        the per-row means and standard deviations stored by the constructor'''

        x1 = self.data.X[i1]
        x2 = self.data.X[i2]

        return numpy.dot(x1 - self.mean[i1], x2 - self.mean[i2]) / \
               (len(x1) * self.std[i1] * self.std[i2])
    
    
def corrcoef2(X) :
    '''compute the correlation between the rows of the matrix X
    more space efficient than numpy version

    Apart from the (n,n) result, only row-sized temporaries are allocated.
    Rows with zero standard deviation are not treated specially.

    X -- 2-d array-like with n rows and d columns

    returns a symmetric (n,n) float array with ones on the diagonal
    '''

    X = numpy.asarray(X)
    (n,d) = numpy.shape(X)

    m = numpy.mean(X, 1)
    sd = numpy.std(X, 1)

    # the diagonal keeps its initial value of 1
    K = numpy.ones((n,n), float)

    for i in range(n - 1) :
        centered = X[i] - m[i]
        for j in range(i + 1, n) :
            # each row is centered with its own mean
            K[i, j] = numpy.dot(centered, X[j] - m[j]) / (d * sd[i] * sd[j])
            K[j, i] = K[i, j]

    return K

def std(m,axis=0):
    """std(m,axis=0) returns the standard deviation along the given
    dimension of m.  The squared deviations from the mean are averaged
    with division by N, the number of elements along that dimension
    (not N-1).
    If m is of integer type returns a floating point answer.
    """
    values = numpy.asarray(m)
    count = float(values.shape[axis])
    # turn a negative axis into the equivalent non-negative one
    if axis < 0:
        axis = axis + len(values.shape)
    center = numpy.asarray(numpy.mean(values, axis))
    # put a length-1 dimension back where the mean removed one, so that the
    # mean broadcasts against the data
    center = center.reshape(center.shape[:axis] + (1,) + center.shape[axis:])
    deviations = values - center
    sumOfSquares = numpy.add.reduce(deviations * deviations, axis)
    return numpy.sqrt(sumOfSquares / count)

def corrcoef(X) :
    '''returns the matrix of correlation coefficients between the rows of X

    X -- 2-d array-like with n rows and d columns

    returns an (n,n) array; rows with zero standard deviation are not
    treated specially
    '''

    X = numpy.asarray(X)
    (n,d) = numpy.shape(X)

    # standardize every row: subtract its mean, divide by its standard
    # deviation (numpy.std, i.e. normalized by d)
    m = numpy.mean(X, 1)
    sd = numpy.std(X, 1)
    Xn = (X - m[:, numpy.newaxis]) / sd[:, numpy.newaxis]

    # The standard deviation above is normalized by d, so the products have
    # to be divided by d as well (not d - 1); otherwise the diagonal comes
    # out as d/(d-1) instead of 1.
    return numpy.dot(Xn, numpy.transpose(Xn)) / d

def corrcoefij(X, i, j) :
    '''returns the correlation coefficient between rows i and j of X

    Only the two rows involved are processed.  Rows with zero standard
    deviation are not treated specially.

    X -- 2-d array with n rows and d columns
    i, j -- row indices
    '''

    (n,d) = numpy.shape(X)

    xi = numpy.asarray(X[i])
    xj = numpy.asarray(X[j])

    # each row is centered with its own mean
    ci = xi - numpy.mean(xi)
    cj = xj - numpy.mean(xj)

    return numpy.dot(ci, cj) / (d * numpy.std(xi) * numpy.std(xj))


class Standardizer (PyMLobject) :
    """
    class for performing feature normalization

    For each feature the Standardizer subtracts the feature's mean
    and divides by its standard deviation

    this rescaling is composed of two operations:

      1.  ``centering`` -- subtract from a feature its mean value;
          this is referred to as 'translation'; the translation attribute
          gives the value with which to translate each feature
      2.  ``scaling`` -- divide a feature by a scale, e.g. its standard deviation;
          the 'scale' attribute gives the value with which to scale each feature

    the 'train' method of the class computes the translation and scaling
    factors, and performs normalization of the training data
    the 'test' method uses values computed on the training data to normalize
    the test data.

    the 'translate' and 'rescale' attributes switch the two operations
    on or off.

    **caveat:**
    Beware of performing training multiple times on the same dataset:
    if a dataset has already been standardized, re-standardization
    will recompute mean and standard deviation, which will be approximately
    0 and 1 for each feature; subsequent application on test data will
    have no effect. Because of this an exception is raised if the user
    attempts to re-train an already trained Standardizer object.
    """

    # default attribute values; handed to the object through
    # PyMLobject.__init__ (presumably overridable by keyword arguments --
    # confirm against PyMLobject)
    attributes = {'translate' : True,
                  'rescale' : True,
                  'translation' : None,
                  'scale' : None}

    def __init__(self, **args) :

        PyMLobject.__init__(self, args)

    def train(self, data, *options, **args) :
        """compute the translation and scale from data, then normalize data

        data -- a dataset providing the mean(), std(), translate() and
                scale() methods used here
        options, args -- accepted but not used by this method

        raises ValueError if a translation or scale has already been set
        """

        if self.translation is not None or self.scale is not None :
            raise ValueError('object already trained')
        if self.translate :
            self.translation = data.mean()
        if self.rescale :
            self.scale = numpy.array(data.std())
            # need to avoid division by 0, so
            # scales that are (nearly) equal to 0 are replaced with a value
            # of 1; nan scales are replaced as well -- a comparison with eps
            # is never true for nan, so they are tested for explicitly
            eps = 1e-5
            degenerate = numpy.logical_or(numpy.less(self.scale, eps),
                                          numpy.isnan(self.scale))
            I = numpy.nonzero(degenerate)[0]
            numpy.put(self.scale, I, 1)

        self.preproc(data)

    def preproc(self, data) :
        """apply the stored translation and / or scale to data by calling
        data.translate and data.scale"""

        if self.translate :
            data.translate(self.translation)
        if self.rescale :
            data.scale(1.0 / self.scale)

    def test(self, data, *options, **args) :
        """normalize data with the values computed by 'train'

        options, args -- accepted but not used by this method
        """

        self.preproc(data)
    
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.