001: /* Copyright (c) 2006-2007, Vladimir Nikic
002: All rights reserved.
003:
004: Redistribution and use of this software in source and binary forms,
005: with or without modification, are permitted provided that the following
006: conditions are met:
007:
008: * Redistributions of source code must retain the above
009: copyright notice, this list of conditions and the
010: following disclaimer.
011:
012: * Redistributions in binary form must reproduce the above
013: copyright notice, this list of conditions and the
014: following disclaimer in the documentation and/or other
015: materials provided with the distribution.
016:
017: * The name of Web-Harvest may not be used to endorse or promote
018: products derived from this software without specific prior
019: written permission.
020:
021: THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
022: AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
023: IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
024: ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
025: LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
026: CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
027: SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
028: INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
029: CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
030: ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
031: POSSIBILITY OF SUCH DAMAGE.
032:
033: You can contact Vladimir Nikic by sending e-mail to
034: nikic_vladimir@yahoo.com. Please include the word "Web-Harvest" in the
035: subject line.
036: */
037: package org.webharvest.runtime.processors;
038:
039: import java.util.*;
040: import java.util.regex.Matcher;
041: import java.util.regex.Pattern;
042:
043: import org.webharvest.definition.RegexpDef;
044: import org.webharvest.definition.BaseElementDef;
045: import org.webharvest.runtime.Scraper;
046: import org.webharvest.runtime.ScraperContext;
047: import org.webharvest.runtime.scripting.ScriptEngine;
048: import org.webharvest.runtime.templaters.BaseTemplater;
049: import org.webharvest.runtime.variables.*;
050: import org.webharvest.utils.CommonUtil;
051: import org.webharvest.utils.Constants;
052:
053: /**
054: * Regular expression replace processor.
055: */
056: public class RegexpProcessor extends BaseProcessor {
057:
058: private RegexpDef regexpDef;
059:
060: public RegexpProcessor(RegexpDef regexpDef) {
061: super (regexpDef);
062: this .regexpDef = regexpDef;
063: }
064:
065: public IVariable execute(Scraper scraper, ScraperContext context) {
066: ScriptEngine scriptEngine = scraper.getScriptEngine();
067:
068: BaseElementDef patternDef = regexpDef.getRegexpPatternDef();
069: IVariable patternVar = getBodyTextContent(patternDef, scraper,
070: context);
071: debug(patternDef, scraper, patternVar);
072:
073: BaseElementDef sourceDef = regexpDef.getRegexpSourceDef();
074: IVariable source = getBodyListContent(sourceDef, scraper,
075: context);
076: debug(sourceDef, scraper, source);
077:
078: String replace = BaseTemplater.execute(regexpDef.getReplace(),
079: scriptEngine);
080: boolean isReplace = CommonUtil.isBooleanTrue(replace);
081:
082: String maxLoopsString = BaseTemplater.execute(regexpDef
083: .getMax(), scriptEngine);
084: double maxLoops = Constants.DEFAULT_MAX_LOOPS;
085: if (maxLoopsString != null && !"".equals(maxLoopsString.trim())) {
086: maxLoops = Double.parseDouble(maxLoopsString);
087: }
088:
089: Pattern pattern = Pattern.compile(patternVar.toString(),
090: Pattern.DOTALL | Pattern.UNICODE_CASE);
091:
092: List resultList = new ArrayList();
093:
094: List bodyList = source.toList();
095: Iterator it = bodyList.iterator();
096: while (it.hasNext()) {
097: IVariable currVar = (IVariable) it.next();
098: String text = currVar.toString();
099:
100: Matcher matcher = pattern.matcher(text);
101: int groupCount = matcher.groupCount();
102:
103: StringBuffer buffer = new StringBuffer();
104:
105: int index = 0;
106: while (matcher.find()) {
107: index++;
108:
109: // if index exceeds maximum number of matching sequences exists the loop
110: if (maxLoops < index) {
111: break;
112: }
113:
114: for (int i = 0; i <= groupCount; i++) {
115: context.put("_" + i, new NodeVariable(matcher
116: .group(i)));
117: }
118:
119: BaseElementDef resultDef = regexpDef
120: .getRegexpResultDef();
121: IVariable result = getBodyTextContent(resultDef,
122: scraper, context);
123: debug(resultDef, scraper, result);
124:
125: String currResult = (result == null) ? matcher.group(0)
126: : result.toString();
127: if (isReplace) {
128: matcher.appendReplacement(buffer, currResult);
129: } else {
130: resultList.add(new NodeVariable(currResult));
131: }
132:
133: for (int i = 0; i <= groupCount; i++) {
134: context.remove("_" + i);
135: }
136: }
137:
138: if (isReplace) {
139: matcher.appendTail(buffer);
140: resultList.add(new NodeVariable(buffer.toString()));
141: }
142: }
143:
144: return new ListVariable(resultList);
145: }
146:
147: }
|