// Reads CSV (Comma Separated Value) files : CSV File « Development Class


  

/*------------------------------------------------------------------------------

Name:      CSVReader.java

Project:   jutils.org

Comment:   Reads CSV (Comma Separated Value) files

Version:   $Id: CSVReader.java,v 1.1 2004/04/07 07:40:45 laurent Exp $

Author:    Roedy Green roedy@mindprod.com, Heinrich Goetzger goetzger@gmx.net

------------------------------------------------------------------------------*/





import java.util.Vector;

import java.io.BufferedReader;

import java.io.EOFException;

import java.io.FileReader;

import java.io.IOException;

import java.io.Reader;



/**

 * Reads CSV (Comma Separated Value) files.

 *

 * This format is mostly used by Microsoft Word and Excel.

 * Fields are separated by commas, and enclosed in

 * quotes if they contain commas or quotes.

 * Embedded quotes are doubled.

 * Embedded spaces do not normally require surrounding quotes.

 * The last field on the line is not followed by a comma.

 * Null fields are represented by two commas in a row.

 * We ignore leading and trailing spaces on fields, even inside quotes.

 *

 * @author copyright (c) 2002 Roedy Green  Canadian Mind Products

 * Roedy posted this code on Newsgroups:comp.lang.java.programmer on 27th March 2002.

 *

 * Heinrich added some stuff like comment ability and linewise working.

 *

 */



public class CSVReader {

   /**
    * Switch for the built-in command line test drivers
    * ({@link #testSingleTokens} and {@link #testLines}).
    */
   private static final boolean debugging = true;

   /* ---- character categories produced by categorise() ---- */

   /**
    * category of end of line char.
    */
   private static final int EOL = 0;

   /**
    * category of ordinary character
    */
   private static final int ORDINARY = 1;

   /**
    * category of the quote mark "
    */
   private static final int QUOTE = 2;

   /**
    * category of the separator, e.g. comma, semicolon
    * or tab.
    */
   private static final int SEPARATOR = 3;

   /**
    * category of characters treated as white space.
    */
   private static final int WHITESPACE = 4;

   /**
    * category of the comment introducer '#'.
    * Outside a quoted field it ends the line exactly like {@link #EOL};
    * inside a quoted field it is plain data.
    */
   private static final int COMMENT = 5;

   /* ---- parser states used inside get() ---- */

   /**
    * parser: We are in blanks before the field.
    */
   private static final int SEEKINGSTART = 0;

   /**
    * parser: We are in the middle of an ordinary field.
    */
   private static final int INPLAIN = 1;

   /**
    * parser: We are in middle of field surrounded in quotes.
    */
   private static final int INQUOTED = 2;

   /**
    * parser: We have just hit a quote, might be doubled
    * or might be last one.
    */
   private static final int AFTERENDQUOTE = 3;

   /**
    * parser: We are in blanks after the field looking for the separator
    */
   private static final int SKIPPINGTAIL = 4;

   /**
    * Reader source of the CSV fields to be read.
    * Set to null by {@link #close()}.
    */
   private BufferedReader r;

   /**
    * field separator character, usually ',' in North America,
    * ';' in Europe and sometimes '\t' for tab.
    */
   private final char separator;

   /**
    * The line we are parsing.
    * null means none read yet.
    * Line contains unprocessed chars. Processed ones are removed.
    */
   private String line = null;

   /**
    * How many lines we have read so far.
    * Used in error messages.
    */
   private int lineCount = 0;

   /**
    * Constructor
    *
    * @param r     input Reader source of CSV Fields to read.
    * @param separator
    *               field separator character, usually ',' in North America,
    *               ';' in Europe and sometimes '\t' for tab.
    */
   public CSVReader (Reader r, char separator) {
      /* convert Reader to BufferedReader if necessary */
      if ( r instanceof BufferedReader ) {
         this.r = (BufferedReader) r;
      } else {
         this.r = new BufferedReader(r);
      }
      this.separator = separator;
   } // end of CSVReader

   /**
    * Constructor with default field separator ','.
    *
    * @param r     input Reader source of CSV Fields to read.
    */
   public CSVReader (Reader r) {
      this(r, ',');
   } // end of CSVReader

   /**
    * categorise a character for the finite state machine.
    *
    * @param c      the character to categorise
    * @return integer representing the character's category.
    */
   private int categorise ( char c ) {
      switch ( c ) {
         case ' ':
         case '\r':
         case 0xff:
            return WHITESPACE;
         case '#':
            /* start of a comment; get() decides whether it is data (in quotes) */
            return COMMENT;
         case '\n':
            return EOL; /* artificially applied to end of line */
         case '\"':
            return QUOTE;
         default:
            if ( c == separator ) {
               /* dynamically determined so can't use as case label;
                  tested before the whitespace checks so that '\t' works
                  as a separator */
               return SEPARATOR;
            } else if ( '!' <= c && c <= '~' ) {
               /* do our tests in crafted order, hoping for an early return */
               return ORDINARY;
            } else if ( c <= 0x20 ) {
               /* control characters */
               return WHITESPACE;
            } else if ( Character.isWhitespace(c) ) {
               return WHITESPACE;
            } else {
               return ORDINARY;
            }
      } // end of switch
   } // end of categorise

   /**
    * Read all fields of the next line that contains at least one field.
    * Blank lines and lines holding only a '#' comment are skipped.
    * An empty field at the very end of a line (i.e. a trailing separator)
    * is not reported.
    *
    * @return the fields of the line, never an empty array;
    *         null at end of file.
    *
    * @exception RuntimeException
    *                   if the line is malformed or the underlying Reader
    *                   fails; the original IOException is the cause.
    *                   The offending line is discarded, so the next call
    *                   continues with the following line.
    */
   public String[] getLine() {
      /* raw Vector: this file sticks to pre-generics Java */
      Vector fields = new Vector();

      try {
         /* keep going until a line actually produced a field */
         while ( fields.isEmpty() ) {
            String token;
            /* get() answers null at the end of each line */
            while ( (token = get()) != null ) {
               fields.add(token);
            } // end of while
         } // end of while
      } catch ( EOFException e ) {
         return null;
      } catch ( IOException e ) {
         /* Drop the broken line. Leaving it in place would make every
            later call re-parse the same text and fail again. */
         line = null;
         throw new RuntimeException("Failed to read CSV line " + lineCount + ": " + e.getMessage(), e);
      }

      String[] result = new String[fields.size()];
      fields.copyInto(result);
      return result;
   } // end of getLine

   /**
    * Read one field from the CSV file
    *
    * @return String value, even if the field is numeric.  Surrounded
    *         and embedded double quotes are stripped.
    *         possibly "".  null means end of line.
    *
    * @exception EOFException
    *                   at end of file after all the fields have
    *                   been read.
    *
    * @exception IOException
    *                   Some problem reading the file, possibly malformed data.
    */
   private String get() throws EOFException, IOException {
      StringBuffer field = new StringBuffer(50);
      /* we implement the parser as a finite state automaton with five states. */
      readLine();

      int state = SEEKINGSTART; /* start seeking, even if partway through a line */
      /* don't need to maintain state between fields. */

      /* loop for each char in the line to find a field */
      /* guaranteed to leave early by hitting EOL */
      for ( int i=0; i<line.length(); i++ ) {
         char c = line.charAt(i);
         int category = categorise(c);
         switch ( state ) {
            case SEEKINGSTART: {
               /* in blanks before field */
               switch ( category ) {
                  case WHITESPACE:
                     /* ignore */
                     break;
                  case QUOTE:
                     state = INQUOTED;
                     break;
                  case SEPARATOR:
                     /* end of empty field */
                     line = line.substring(i+1);
                     return "";
                  case COMMENT:
                  case EOL:
                     /* end of line; the rest (if any) is comment text */
                     line = null;
                     return null;
                  case ORDINARY:
                     field.append(c);
                     state = INPLAIN;
                     break;
               }
               break;
            } // end of SEEKINGSTART
            case INPLAIN: {
               /* in middle of ordinary field */
               switch ( category ) {
                  case QUOTE:
                     throw new IOException("Malformed CSV stream. Missing quote at start of field on line " + lineCount);
                  case SEPARATOR:
                     /* done */
                     line = line.substring(i+1);
                     return field.toString().trim();
                  case COMMENT:
                  case EOL:
                     line = line.substring(i); /* push EOL back */
                     return field.toString().trim();
                  case WHITESPACE:
                     /* any white space char is stored as a plain blank */
                     field.append(' ');
                     break;
                  case ORDINARY:
                     field.append(c);
                     break;
               }
               break;
            } // end of INPLAIN
            case INQUOTED: {
               /* in middle of field surrounded in quotes */
               switch ( category ) {
                  case QUOTE:
                     state = AFTERENDQUOTE;
                     break;
                  case EOL:
                     throw new IOException ("Malformed CSV stream. Missing quote after field on line "+lineCount);
                  case WHITESPACE:
                     field.append(' ');
                     break;
                  case COMMENT:
                     /* '#' between quotes is data, not a comment */
                  case SEPARATOR:
                  case ORDINARY:
                     field.append(c);
                     break;
               }
               break;
            } // end of INQUOTED
            case AFTERENDQUOTE: {
               /* In situation like this "xxx" which may
                  turn out to be xxx""xxx" or "xxx",
                  We find out here. */
               switch ( category ) {
                  case QUOTE:
                     /* doubled quote: one literal quote, still inside the field */
                     field.append(c);
                     state = INQUOTED;
                     break;
                  case SEPARATOR:
                     /* we are done.*/
                     line = line.substring(i+1);
                     return field.toString().trim();
                  case COMMENT:
                  case EOL:
                     line = line.substring(i); /* push back eol */
                     return field.toString().trim();
                  case WHITESPACE:
                     /* ignore trailing spaces up to separator */
                     state = SKIPPINGTAIL;
                     break;
                  case ORDINARY:
                     throw new IOException("Malformed CSV stream, missing separator after field on line " + lineCount);
               }
               break;
            } // end of AFTERENDQUOTE
            case SKIPPINGTAIL: {
               /* in spaces after field seeking separator */
               switch ( category ) {
                  case SEPARATOR:
                     /* we are done.*/
                     line = line.substring(i+1);
                     return field.toString().trim();
                  case COMMENT:
                  case EOL:
                     line = line.substring(i); /* push back eol */
                     return field.toString().trim();
                  case WHITESPACE:
                     /* ignore trailing spaces up to separator */
                     break;
                  case QUOTE:
                  case ORDINARY:
                     throw new IOException("Malformed CSV stream, missing separator after field on line " + lineCount);
               } // end of switch
               break;
            } // end of SKIPPINGTAIL
         } // end switch(state)
      } // end for
      throw new IOException("Program logic bug. Should not reach here. Processing line " + lineCount);
   } // end get

   /**
    * Make sure a line is available for parsing.
    * Does nothing if there already is one.
    *
    * @exception EOFException
    *                   if the Reader has no more lines.
    * @exception IOException
    *                   if the Reader fails or this CSVReader has been closed.
    */
   private void readLine() throws EOFException, IOException {
      if ( line != null ) {
         return;
      }
      if ( r == null ) {
         /* close() was called; fail clearly instead of with a NullPointerException */
         throw new IOException("CSVReader has already been closed");
      }
      line = r.readLine();  /* this strips platform specific line ending */
      if ( line == null ) {
         /* null means EOF */
         throw new EOFException();
      }
      line += '\n'; /* apply standard line end for parser to find */
      lineCount++;
   } // end of readLine

   /**
    * Skip over fields you don't want to process.
    *
    * @param fields How many field you want to bypass reading.
    *               The newline counts as one field.
    *               Values of zero or less skip nothing.
    * @exception EOFException
    *                   at end of file after all the fields have
    *                   been read.
    * @exception IOException
    *                   Some problem reading the file, possibly malformed data.
    */
   public void skip(int fields) throws EOFException, IOException {
      for ( int i=0; i<fields; i++ ) {
         // throw results away
         get();
      }
   } // end of skip

   /**
    * Skip over remaining fields on this line you don't want to process.
    * If no line is pending, the next line is read and discarded.
    *
    * @exception EOFException
    *                   at end of file after all the fields have
    *                   been read.
    * @exception IOException
    *                   Some problem reading the file, possibly malformed data.
    */
   public void skipToNextLine() throws EOFException, IOException {
      readLine(); /* no-op when a line is already pending */
      line = null;
   } // end of skipToNextLine

   /**
    * Close the Reader. Calling it again has no effect.
    *
    * @exception IOException
    *                   if closing the underlying Reader fails.
    */
   public void close() throws IOException {
      if ( r != null ) {
         r.close();
         r = null;
      }
   } // end of close

   /**
    * Test driver: prints every field on its own line; the end of each
    * input line shows up as "null".
    *
    * @param args  [0]: The name of the file.
    */
   private static void testSingleTokens(String[] args) {
      if ( !debugging ) {
         return;
      }
      try {
         // read test file
         CSVReader csv = new CSVReader(new FileReader(args[0]), ',');
         try {
            while ( true ) {
               System.out.println(csv.get());
            }
         } catch ( EOFException e ) {
            /* expected: this is how get() reports the end of the file */
         } finally {
            csv.close();
         }
      } catch ( IOException e ) {
         e.printStackTrace();
         System.out.println(e.getMessage());
      }
   } // end of testSingleTokens

   /**
    * Test driver: prints every parsed line with its fields joined by ','.
    *
    * @param args  [0]: The name of the file.
    */
   private static void testLines(String[] args) {
      if ( !debugging ) {
         return;
      }
      final String DEL = ",";
      try {
         // read test file
         CSVReader csv = new CSVReader(new FileReader(args[0]), ',');
         try {
            String[] loadLine;
            while ( (loadLine = csv.getLine()) != null ) {
               StringBuffer logBuffer = new StringBuffer();
               for ( int i=0; i < loadLine.length; i++ ) {
                  if ( i > 0 ) {
                     logBuffer.append(DEL);
                  }
                  logBuffer.append(loadLine[i]);
               }
               System.out.println(logBuffer.toString());
            } // end of while
         } finally {
            /* release the file even when parsing fails */
            csv.close();
         }
      } catch ( IOException e ) {
         e.printStackTrace();
         System.out.println(e.getMessage());
      } catch ( RuntimeException e ) {
         /* getLine() reports malformed data this way */
         e.printStackTrace();
         System.out.println(e.getMessage());
      }
   } // end of testLines

   /**
    * Test driver
    *
    * @param args  [0]: The name of the file.
    */
   static public void main(String[] args) {
      if ( args == null || args.length < 1 ) {
         System.err.println("Usage: java CSVReader <csv-file>");
         return;
      }
      //testSingleTokens(args);
      testLines(args);
   } // end main

} // end CSVReader



// end of file
// Reads CSV (Comma Separated Value) files : CSV File « Development Class « Java