// A simple FilterReader that strips HTML tags out of a stream of characters (Buffered Character Reading, File Input/Output)


 

 

/*

 * Copyright (c) 2004 David Flanagan.  All rights reserved.

 * This code is from the book Java Examples in a Nutshell, 3rd Edition.

 * It is provided AS-IS, WITHOUT ANY WARRANTY either expressed or implied.

 * You may study, use, and modify it for any non-commercial purpose,

 * including teaching and use in open-source projects.

 * You may distribute it non-commercially as long as you retain this notice.

 * For a commercial use license, or to purchase the book, 

 * please visit http://www.davidflanagan.com/javaexamples3.

 */



import java.io.BufferedReader;

import java.io.FileReader;

import java.io.FilterReader;

import java.io.IOException;

import java.io.Reader;



/**

 * A simple FilterReader that strips HTML tags (or anything between pairs of

 * angle brackets) out of a stream of characters.

 */

public class RemoveHTMLReader extends FilterReader {

  /** Upper bound on the scratch buffer allocated by {@link #skip(long)}. */
  private static final int MAX_SKIP_BUFFER = 8192;

  /**
   * Remembers whether we are "inside" a tag. The state persists across calls
   * to read(), so a tag that straddles two buffers is still stripped.
   */
  boolean intag = false;

  /**
   * A trivial constructor. Just initialize our superclass.
   *
   * @param in the reader whose characters are to be filtered
   */
  public RemoveHTMLReader(Reader in) {
    super(in);
  }

  /**
   * Reads characters from the wrapped reader into {@code buf} and removes
   * everything between a '&lt;' and the next '&gt;' (brackets included). The
   * surviving characters are compacted in place starting at {@code from}.
   * (in is a protected field of the superclass.)
   *
   * <p>Note: mark() and reset() are not overridden by this class, so they do
   * not save or restore the in-tag state.
   *
   * @param buf destination buffer
   * @param from index in {@code buf} at which to start storing characters
   * @param len maximum number of characters to read
   * @return the number of characters stored (at least 1 when {@code len > 0}),
   *         0 when {@code len} is 0, or -1 at end of stream
   * @throws IndexOutOfBoundsException if {@code from}/{@code len} do not
   *         describe a valid range of {@code buf}
   * @throws IOException if the wrapped reader fails
   */
  @Override
  public int read(char[] buf, int from, int len) throws IOException {
    if (from < 0 || len < 0 || len > buf.length - from) {
      throw new IndexOutOfBoundsException(
          "from=" + from + ", len=" + len + ", buf.length=" + buf.length);
    }
    // A zero-length request must return immediately. Without this guard the
    // underlying read returns 0 and the loop below would never terminate.
    if (len == 0) {
      return 0;
    }

    int numchars = 0; // how many characters survive the filtering
    // Loop, because we might read a bunch of characters, then strip them
    // all out, leaving us with zero characters to return.
    while (numchars == 0) {
      numchars = in.read(buf, from, len);
      if (numchars == -1) {
        return -1; // End of stream.
      }

      // Characters not in tags are copied down over the stripped ones.
      int last = from; // Index at which the next kept char is stored
      for (int i = from; i < from + numchars; i++) {
        if (!intag) {
          if (buf[i] == '<') {
            intag = true; // Tag start: drop it and everything after it
          } else {
            buf[last++] = buf[i]; // Ordinary character: keep it
          }
        } else if (buf[i] == '>') {
          intag = false; // Tag end: resume copying with the next char
        }
      }
      numchars = last - from; // How many characters remain
    }
    return numchars;
  }

  /**
   * Reads a single filtered character. Implemented in terms of
   * {@link #read(char[], int, int)}.
   *
   * @return the character read, or -1 at end of stream
   * @throws IOException if the wrapped reader fails
   */
  @Override
  public int read() throws IOException {
    char[] buf = new char[1];
    int result = read(buf, 0, 1);
    if (result == -1) {
      return -1;
    }
    return (int) buf[0];
  }

  /**
   * Skips {@code n} characters of the filtered stream. The inherited
   * implementation skips raw characters in the wrapped reader, which bypasses
   * the tag tracking (skipping into the middle of a tag would let the rest of
   * the tag leak into the output), so we skip by reading and discarding.
   *
   * @param n the number of filtered characters to skip
   * @return the number of characters actually skipped (less than {@code n}
   *         only if end of stream was reached)
   * @throws IllegalArgumentException if {@code n} is negative
   * @throws IOException if the wrapped reader fails
   */
  @Override
  public long skip(long n) throws IOException {
    if (n < 0) {
      throw new IllegalArgumentException("skip value is negative");
    }
    char[] scratch = new char[(int) Math.min(n, (long) MAX_SKIP_BUFFER)];
    long remaining = n;
    while (remaining > 0) {
      int got = read(scratch, 0, (int) Math.min(remaining, (long) scratch.length));
      if (got == -1) {
        break; // End of stream before n characters were skipped
      }
      remaining -= got;
    }
    return n - remaining;
  }

  /**
   * The test program: read a text file, strip HTML, print to console.
   *
   * @param args a single element: the name of the file to read
   */
  public static void main(String[] args) {
    try {
      if (args.length != 1) {
        throw new IllegalArgumentException("Wrong number of args");
      }
      // Create a stream to read from the file and strip tags from it
      BufferedReader in =
          new BufferedReader(new RemoveHTMLReader(new FileReader(args[0])));
      try {
        // Read line by line, printing lines to the console
        String line;
        while ((line = in.readLine()) != null) {
          System.out.println(line);
        }
      } finally {
        in.close(); // Close the stream even if reading or printing failed.
      }
    } catch (Exception e) {
      System.err.println(e);
      System.err.println("Usage: java RemoveHTMLReader" + " <filename>");
    }
  }
}
// A simple FilterReader that strips HTML tags out of a stream of characters (Buffered Character Reading, File Input/Output, Java)