001: // Copyright © 2002-2007 Canoo Engineering AG, Switzerland.
002: package com.canoo.webtest.steps.verify;
003:
004: import com.canoo.webtest.boundary.HtmlUnitBoundary;
005: import com.canoo.webtest.engine.Context;
006: import com.canoo.webtest.engine.StepFailedException;
007: import com.canoo.webtest.engine.RegExStringVerifier;
008: import com.canoo.webtest.steps.Step;
009: import com.canoo.webtest.util.ConversionUtil;
010: import com.gargoylesoftware.htmlunit.Page;
011: import com.gargoylesoftware.htmlunit.WebClient;
012: import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
013: import com.gargoylesoftware.htmlunit.html.HtmlPage;
014:
015: import org.apache.commons.lang.StringUtils;
016: import org.apache.log4j.Logger;
017: import org.xml.sax.SAXException;
018:
019: import java.net.MalformedURLException;
020: import java.net.URL;
021: import java.util.HashSet;
022: import java.util.Iterator;
023: import java.util.Map;
024: import java.util.Set;
025:
026: /**
027: * @author Dierk Koenig, Urs-Peter Häss
028: * @author Marc Guillemot, Paul King, Brian Hubbard
029: * @webtest.step category="Core"
030: * name="verifyLinks"
031: * alias="verifylinks"
032: * description="This step checks the validity of all links on the current page. Non-<key>HTML</key> pages (CSS, <key>javascript</key>, <key>XML</key> files) are not checked for internal links. Non-<key>HTTP</key> links (mail addresses, ftp etc.) are not checked or followed."
033: */
034: public class VerifyLinks extends Step {
035: private static final Logger LOG = Logger
036: .getLogger(VerifyLinks.class);
037: private String fBaseHost;
038: private int fMaxDepth;
039: private String fMaxDepthStr;
040: private int fCurrentDepth;
041: private boolean fOnsiteonly;
042: private String fExcludes;
043: private String fIncludes;
044: private final Set fFailedVisits = new HashSet();
045: private final Set fVisitedUrls = new HashSet();
046: private int fValidLinks;
047: private boolean fIgnoreForeignJSErrors;
048:
049: protected Set getFailedVisits() {
050: return fFailedVisits;
051: }
052:
053: public String getDepth() {
054: return fMaxDepthStr;
055: }
056:
057: /**
058: * @webtest.parameter required="no"
059: * default="0"
060: * description="The <em>depth</em> parameter defines the depth of the recursive search for broken links on sub-pages."
061: */
062: public void setDepth(String depth) {
063: fMaxDepthStr = depth;
064: }
065:
066: /**
067: * @webtest.parameter required="no"
068: * default="<empty>"
069: * description="If <em>excludes</em> is set then each link found is compared to the defined string (via regexp), if it matches then the link is not followed."
070: */
071: public void setExcludes(String regex) {
072: fExcludes = regex;
073: }
074:
075: public String getExcludes() {
076: return fExcludes;
077: }
078:
079: /**
080: * @webtest.parameter required="no"
081: * default="<all>"
082: * description="If <em>includes</em> is set then each link found is compared to the defined string (via regexp), if it matches then the link is processed, others are ignored."
083: */
084: public void setIncludes(String regex) {
085: fIncludes = regex;
086: }
087:
088: public String getIncludes() {
089: return fIncludes;
090: }
091:
092: /**
093: * @webtest.parameter required="no"
094: * default="false"
095: * description="If <em>onsiteonly</em> is set to <em>true</em>, the recursive search for invalid links is limited to the local host.
096: * Only the initial link to a foreign host is checked, but no deeper search is performed."
097: */
098: public void setOnsiteonly(final boolean onsiteonly) {
099: fOnsiteonly = onsiteonly;
100: }
101:
102: /**
103: *
104: * @webtest.parameter required="no"
105: * default="false"
106: * description="Indicates if JavaScript errors should be ignored on visited pages from a different host
107: * than the current page."
108: */
109: public void setIgnoreForeignJSErrors(final boolean b) {
110: fIgnoreForeignJSErrors = b;
111: }
112:
113: public void doExecute() throws SAXException, MalformedURLException {
114: verifyProperties();
115: nullResponseCheck();
116: final Context context = getContext();
117: final HtmlPage htmlPage = context.getCurrentHtmlResponse(this );
118: LOG
119: .info("Examining page with title="
120: + htmlPage.getTitleText());
121: if (!StringUtils.isEmpty(getIncludes())) {
122: LOG.info("Only including links which match '"
123: + getIncludes() + "'");
124: }
125: if (!StringUtils.isEmpty(getExcludes())) {
126: LOG.info("Excluding links which match '" + getExcludes()
127: + "'");
128: }
129: fBaseHost = htmlPage.getWebResponse().getUrl().getHost();
130: final WebClient client = context.getWebClient();
131: checkVisits(client, htmlPage);
132: if (!fFailedVisits.isEmpty()) {
133: throw new StepFailedException(fFailedVisits.size()
134: + " broken link(s): " + brokenLinksToString(), this );
135: }
136: }
137:
138: protected void addComputedParameters(final Map map) {
139: map.put("-> valid links", String.valueOf(fValidLinks));
140: }
141:
142: protected void checkVisits(final WebClient webClient,
143: final HtmlPage response) {
144: final Set urls = getGoodLinks(response);
145: final RegExStringVerifier verifier = new RegExStringVerifier();
146: for (final Iterator iter = urls.iterator(); iter.hasNext();) {
147: final URL url = (URL) iter.next();
148: if (fVisitedUrls.contains(url)) {
149: LOG.debug("Skipped already visited: " + url);
150: fValidLinks++;
151: continue;
152: }
153: if (!StringUtils.isEmpty(getIncludes())
154: && (!verifier.verifyStrings(getIncludes(), url
155: .toString()))) {
156: LOG
157: .info("Skipped link as it doesn't match the includes list: "
158: + url);
159: continue;
160: }
161: if (!StringUtils.isEmpty(getExcludes())
162: && (verifier.verifyStrings(getExcludes(), url
163: .toString()))) {
164: LOG.info("Skipped link as matched the excludes list: "
165: + url);
166: continue;
167: }
168: visit(response, url, webClient);
169: }
170: }
171:
172: protected void visit(final HtmlPage referingPage, final URL url,
173: final WebClient webClient) {
174: final boolean ignoreJSErrorsOriginal = webClient
175: .isThrowExceptionOnScriptError();
176: if (fIgnoreForeignJSErrors && isForeignHost(url)) {
177: LOG.info("Ignore JS errors (if any) for " + url);
178: webClient.setThrowExceptionOnScriptError(false);
179: }
180: final Page response = HtmlUnitBoundary.tryGetPageNoFail(url,
181: webClient);
182: webClient
183: .setThrowExceptionOnScriptError(ignoreJSErrorsOriginal);
184:
185: fVisitedUrls.add(url);
186: if (response == null) {
187: fFailedVisits.add(new ZFailedLink(url, referingPage
188: .getWebResponse().getUrl()));
189: } else {
190: fValidLinks++;
191:
192: if (response instanceof HtmlPage) {
193: followRecursively((HtmlPage) response, webClient);
194: }
195: }
196: }
197:
198: protected void followRecursively(final HtmlPage htmlPage,
199: final WebClient webClient) {
200: LOG.debug("fMaxDepth = " + fMaxDepth);
201: if (fCurrentDepth < fMaxDepth && !stopHunting(htmlPage)) {
202: ++fCurrentDepth;
203: checkVisits(webClient, htmlPage);
204: --fCurrentDepth;
205: }
206: }
207:
208: protected String brokenLinksToString() {
209: StringBuffer sb = new StringBuffer();
210: for (Iterator iter = fFailedVisits.iterator(); iter.hasNext();) {
211: ZFailedLink failedLink = (ZFailedLink) iter.next();
212: sb.append(failedLink.getFailedUrl()).append(" on ").append(
213: failedLink.getReferingUrl()).append("; ");
214: }
215: return sb.toString();
216: }
217:
218: static int getLinkCount(final HtmlPage response) {
219: return getGoodLinks(response).size();
220: }
221:
222: /**
223: * Gets all HTTP links in the response
224: *
225: * @param response
226: * @return a set of {@link URL}
227: */
228: static Set getGoodLinks(final HtmlPage response) {
229: LOG.info("Looking for links in " + response);
230: final Set urls = new HashSet();
231:
232: for (final Iterator iter = response.getAnchors().iterator(); iter
233: .hasNext();) {
234: processLink(response, (HtmlAnchor) iter.next(), urls);
235: }
236:
237: LOG.info(urls.size() + " different links found in page "
238: + response.getWebResponse().getUrl());
239: return urls;
240: }
241:
242: private static void processLink(final HtmlPage response,
243: final HtmlAnchor link, final Set urls) {
244: try {
245: final URL url = response.getFullyQualifiedUrl(link
246: .getHrefAttribute());
247: final String protocol = url.getProtocol();
248: if ("http".equals(protocol) || "https".equals(protocol)) {
249: LOG.info("Adding url to check: " + url);
250: urls.add(url);
251: } else {
252: LOG.info("Skipped link due to protocol: " + url);
253: }
254: } catch (final MalformedURLException e) {
255: LOG.info("Skipped link due to bad url: "
256: + link.getHrefAttribute());
257: }
258: }
259:
260: protected boolean stopHunting(final HtmlPage htmlPage) {
261: return fOnsiteonly
262: && isForeignHost(htmlPage.getWebResponse().getUrl());
263: }
264:
265: protected boolean isForeignHost(final URL url) {
266: return !fBaseHost.equals(url.getHost());
267: }
268:
269: protected void verifyProperties() {
270: fMaxDepth = ConversionUtil.convertToInt(getDepth(), 0);
271: optionalIntegerParamCheck(getDepth(), "depth", true);
272: }
273: }
274:
/**
 * Utility data holder pairing a link that could not be fetched with the URL of
 * the page on which that link was found. Instances are immutable.
 */
class ZFailedLink {
    private final URL fFailedUrl;
    private final URL fReferingUrl;

    /**
     * Creates a holder for one failed link.
     *
     * @param failedUrl the link that could not be fetched
     * @param referingUrl the URL of the page containing the link
     */
    ZFailedLink(final URL failedUrl, final URL referingUrl) {
        fFailedUrl = failedUrl;
        fReferingUrl = referingUrl;
    }

    /**
     * Returns the link that could not be fetched.
     *
     * @return the failed URL as passed to the constructor
     */
    public URL getFailedUrl() {
        return fFailedUrl;
    }

    /**
     * Returns the URL of the page on which the failed link was found.
     *
     * @return the referring URL as passed to the constructor
     */
    public URL getReferingUrl() {
        return fReferingUrl;
    }
}
|