Source Code Cross Referenced for XmlFileIndexer.java in  » Search-Engine » Lius-0.4 » ca » ulaval » bibl » lius » index » XML » Java Source Code / Java Documentation | Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » Lius 0.4 » ca.ulaval.bibl.lius.index.XML 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /*
002:
003:         * LIUS - Lucene Index Update and Search
004:         * http://sourceforge.net/projects/lius/
005:         *
006:         * Copyright (c) 2005, Laval University Library.  All rights reserved.
007:         * 
008:         * This library is free software; you can redistribute it and/or
009:         * modify it under the terms of the GNU Lesser General Public
010:         * License as published by the Free Software Foundation; either
011:         * version 2.1 of the License, or (at your option) any later version.
012:         * 
013:         * This library is distributed in the hope that it will be useful,
014:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
015:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
016:         * Lesser General Public License for more details.
017:         * 
018:         * You should have received a copy of the GNU Lesser General Public
019:         * License along with this library; if not, write to the Free Software
020:         * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
021:         */
022:
023:        package ca.ulaval.bibl.lius.index.XML;
024:
025:        import java.io.File;
026:        import java.io.IOException;
027:        import java.lang.reflect.InvocationTargetException;
028:        import java.util.ArrayList;
029:        import java.util.Collection;
030:        import java.util.Iterator;
031:        import java.util.List;
032:        import java.util.Map;
033:
034:        import org.apache.commons.beanutils.BeanUtils;
035:        import org.apache.log4j.Logger;
036:        import org.jaxen.JaxenException;
037:        import org.jaxen.SimpleNamespaceContext;
038:        import org.jaxen.jdom.JDOMXPath;
039:        import org.jdom.Attribute;
040:        import org.jdom.CDATA;
041:        import org.jdom.Comment;
042:        import org.jdom.Element;
043:        import org.jdom.EntityRef;
044:        import org.jdom.JDOMException;
045:        import org.jdom.Namespace;
046:        import org.jdom.ProcessingInstruction;
047:        import org.jdom.Text;
048:        import org.jdom.input.SAXBuilder;
049:
050:        import ca.ulaval.bibl.lius.Lucene.LuceneActions;
051:        import ca.ulaval.bibl.lius.config.LiusConfig;
052:        import ca.ulaval.bibl.lius.config.LiusConfigBuilder;
053:        import ca.ulaval.bibl.lius.config.LiusField;
054:        import ca.ulaval.bibl.lius.index.Indexer;
055:
056:        /**
057:         * 
058:         * Classe se basant sur JDOM et XPATH pour indexer des fichiers XML.
059:         * 
060:         * <br/><br/>
061:         * 
062:         * Class based on JDOM and XPATH for indexing XML files.
063:         * 
064:         * @author Rida Benjelloun (rida.benjelloun@bibl.ulaval.ca)
065:         *  
066:         */
067:
068:        public class XmlFileIndexer
069:
070:        extends Indexer {
071:
072:            private SimpleNamespaceContext nsc = new SimpleNamespaceContext();
073:
074:            private String toIndex = null;
075:
076:            static Logger logger = Logger.getRootLogger();
077:
078:            /**
079:             * 
080:             * Permet de parser un fichier XML et de retourner un objet de type JDOM
081:             * Document.
082:             * 
083:             * <br/><br/>
084:             * 
085:             * Parse an XML file and returns a JDOM object.
086:             *  
087:             */
088:
089:            public Object parse(Object file) {
090:
091:                org.jdom.Document xmlDoc = new org.jdom.Document();
092:
093:                try {
094:
095:                    SAXBuilder builder = new SAXBuilder();
096:
097:                    builder.setValidation(false);
098:
099:                    xmlDoc = builder.build(new File((String) file));
100:
101:                }
102:
103:                catch (JDOMException e) {
104:
105:                    logger.error(e.getMessage());
106:
107:                }
108:
109:                catch (IOException e) {
110:
111:                    logger.error(e.getMessage());
112:
113:                }
114:
115:                return xmlDoc;
116:
117:            }
118:
119:            /**
120:             * 
121:             * Méthode retournant un objet de type "Lucene document" à partir du fichier
122:             * 
123:             * XML à indexer et du fichier de configuration de Lius exprimé sous forme
124:             * 
125:             * d'objet de type "LiusConfig".
126:             * 
127:             * <br/><br/>
128:             * 
129:             * Method that returns Lucene Document object from an XML file to index and
130:             * 
131:             * the Lius configuration file as a LiusConfig object.
132:             * 
133:             * 
134:             *  
135:             */
136:
137:            public org.apache.lucene.document.Document createLuceneDocument(
138:                    String
139:
140:                    xmlFile, LiusConfig lc) {
141:
142:                Collection liusFields = lc.getXmlFileFields();
143:
144:                org.apache.lucene.document.Document luceneDoc = createLuceneDocument(
145:
146:                xmlFile,
147:
148:                liusFields);
149:
150:                return luceneDoc;
151:
152:            }
153:
154:            /**
155:             * 
156:             * Permet de placer un noeud XML dans un document Lucene. Chaque élément du
157:             * 
158:             * noeud est indexé en se basant sur une collection d'objets de type
159:             * "LiusField"
160:             * 
161:             * qui contient le nom du champs dans lucene, l'expression Xpath pour
162:             * séléctionner
163:             * 
164:             * le noeud et le séparateur d'occurences si on veut placer les différentes
165:             * 
166:             * occurences d'un même élément dans le même champs; dans la cas contraire
167:             * chaque
168:             * 
169:             * élément sera placé dans son propre champs.
170:             * 
171:             * <br/><br/>
172:             * 
173:             * Place an XML node in a Lucene document. Each element of the node is
174:             * indexed
175:             * 
176:             * based on a collection of type LiusField containing the name of the field
177:             * in
178:             * 
179:             * Lucene, the XPath expression to select the node and the hits separator if
180:             * we want
181:             * 
182:             * differents hits of a same element in the same field; in the other case
183:             * each
184:             * 
185:             * element is placed in is own field.
186:             *  
187:             */
188:
189:            public org.apache.lucene.document.Document storeNodeInLuceneDocument(
190:                    Object
191:
192:                    xmlDoc, Collection liusFields) {
193:
194:                Collection resColl = getPopulatedCollection(xmlDoc, liusFields);
195:
196:                org.apache.lucene.document.Document luceneDoc = LuceneActions.
197:
198:                getSingletonInstance().populateLuceneDoc(resColl);
199:
200:                return luceneDoc;
201:
202:            }
203:
204:            /*
205:             * private void populateField(LiusField newField, LiusField lf) {
206:             * 
207:             * newField.setName(lf.getName());
208:             * 
209:             * newField.setType(lf.getType());
210:             * 
211:             * newField.setXpathSelect(lf.getXpathSelect());
212:             * 
213:             * newField.setDateFormat(lf.getDateFormat());
214:             *  }
215:             */
216:
217:            /**
218:             * 
219:             * Méthode permettant de concaténer les occurences multiples d'un élément
220:             * qui
221:             * 
222:             * vont être stockées dans le même document Lucene.
223:             * 
224:             * <br/><br/>
225:             * 
226:             * Method that concatenates multiple hist of an element which will be saved
227:             * in
228:             * 
229:             * the same Lucene document.
230:             *  
231:             */
232:
233:            public String concatOccurance(Object xmlDoc,
234:
235:            String xpath,
236:
237:            String concatSep) {
238:
239:                StringBuffer chaineConcat = new StringBuffer();
240:
241:                try {
242:
243:                    JDOMXPath xp = new JDOMXPath(xpath);
244:
245:                    xp.setNamespaceContext(nsc);
246:
247:                    List ls = xp.selectNodes(xmlDoc);
248:
249:                    Iterator i = ls.iterator();
250:
251:                    int j = 0;
252:
253:                    while (i.hasNext()) {
254:
255:                        j++;
256:
257:                        String text = "";
258:
259:                        Object obj = (Object) i.next();
260:
261:                        if (obj instanceof  Element) {
262:
263:                            Element elem = (Element) obj;
264:
265:                            text = elem.getText().trim();
266:
267:                        }
268:
269:                        else if (obj instanceof  Attribute) {
270:
271:                            Attribute att = (Attribute) obj;
272:
273:                            text = att.getValue().trim();
274:
275:                        }
276:
277:                        else if (obj instanceof  Text) {
278:
279:                            Text txt = (Text) obj;
280:
281:                            text = txt.getText().trim();
282:
283:                        }
284:
285:                        else if (obj instanceof  CDATA) {
286:
287:                            CDATA cdata = (CDATA) obj;
288:
289:                            text = cdata.getText().trim();
290:
291:                        }
292:
293:                        else if (obj instanceof  Comment) {
294:
295:                            Comment com = (Comment) obj;
296:
297:                            text = com.getText().trim();
298:
299:                        }
300:
301:                        else if (obj instanceof  ProcessingInstruction) {
302:
303:                            ProcessingInstruction pi = (ProcessingInstruction) obj;
304:
305:                            text = pi.getData().trim();
306:
307:                        }
308:
309:                        else if (obj instanceof  EntityRef) {
310:
311:                            EntityRef er = (EntityRef) obj;
312:
313:                            text = er.toString().trim();
314:
315:                        }
316:
317:                        if (text != "") {
318:
319:                            if (ls.size() == 1) {
320:
321:                                chaineConcat.append(text);
322:
323:                                return chaineConcat.toString().trim();
324:
325:                            }
326:
327:                            else {
328:
329:                                if (ls.size() == j)
330:
331:                                    chaineConcat.append(text);
332:
333:                                else
334:
335:                                    chaineConcat.append(text + " " + concatSep
336:                                            + " ");
337:
338:                            }
339:
340:                        }
341:
342:                    }
343:
344:                }
345:
346:                catch (JaxenException j) {
347:
348:                    logger.error(j.getMessage());
349:
350:                }
351:
352:                return chaineConcat.toString().trim();
353:
354:            }
355:
356:            public void setFileName(String toIndex) {
357:
358:                this .toIndex = toIndex;
359:
360:            }
361:
362:            public String getFileName() {
363:
364:                return toIndex;
365:
366:            }
367:
368:            /**
369:             * 
370:             * Retourne une collection contenant les champs avec les valeurs à indexer
371:             * 
372:             * comme par exemple: le texte integral, titre etc.
373:             * 
374:             * <br/><br/>
375:             * 
376:             * Returns a collection containing the fields with the values to index; like :
377:             * 
378:             * full text, title, etc.
379:             *  
380:             */
381:
382:            public Collection getPopulatedCollection(Object xmlFile,
383:                    Collection liusField) {
384:
385:                Object xmlDoc = null;
386:
387:                List documentNs = null;
388:
389:                Map hm = null;
390:
391:                boolean nsTrouve = false;
392:
393:                boolean isMap = false;
394:
395:                Collection resColl = new ArrayList();
396:
397:                if (xmlFile instanceof  org.jdom.Document ||
398:
399:                xmlFile instanceof  org.jdom.Element)
400:
401:                    xmlDoc = xmlFile;
402:
403:                else
404:
405:                    xmlDoc = parse(xmlFile);
406:
407:                if (xmlDoc instanceof  org.jdom.Document) {
408:
409:                    documentNs = getAllDocumentNs((org.jdom.Document) xmlDoc);
410:
411:                }
412:
413:                Iterator itColl = liusField.iterator();
414:
415:                while (itColl.hasNext()) {
416:
417:                    Object colElem = itColl.next();
418:
419:                    if (colElem instanceof  Map) {
420:
421:                        isMap = true;
422:
423:                        hm = (Map) colElem;
424:
425:                        for (int j = 0; j < documentNs.size(); j++) {
426:
427:                            Collection liusFields = (Collection) hm
428:                                    .get(documentNs.get(j));
429:
430:                            if (liusFields != null) {
431:
432:                                nsTrouve = true;
433:
434:                                extractDataFromElements(xmlDoc, liusFields,
435:                                        resColl);
436:
437:                            }
438:
439:                        }
440:
441:                    }
442:
443:                    if (nsTrouve == false && (colElem instanceof  Map)) {
444:
445:                        extractDataFromElements(xmlDoc, (Collection) hm
446:                                .get("default"), resColl);
447:
448:                    }
449:
450:                }
451:
452:                if (isMap == false)
453:
454:                    extractDataFromElements(xmlDoc, liusField, resColl);
455:
456:                return resColl;
457:
458:            }
459:
460:            public Collection getPopulatedCollection(Object file,
461:                    String liusConfig) {
462:
463:                LiusConfig lc = LiusConfigBuilder.getSingletonInstance()
464:                        .getLiusConfig(
465:
466:                        liusConfig);
467:
468:                return getPopulatedCollection(file, lc);
469:
470:            }
471:
472:            public Collection getPopulatedCollection(Object file, LiusConfig lc) {
473:
474:                return getPopulatedCollection(file, lc.getXmlFileFields());
475:
476:            }
477:
478:            private void extractDataFromElements(Object xmlDoc,
479:                    Collection liusFields,
480:
481:                    Collection resColl) {
482:
483:                Iterator it = liusFields.iterator();
484:
485:                while (it.hasNext()) {
486:
487:                    Object field = it.next();
488:
489:                    if (field instanceof  LiusField) {
490:
491:                        LiusField lf = (LiusField) field;
492:
493:                        if (lf.getOcurSep() != null) {
494:
495:                            String cont = concatOccurance(xmlDoc, lf
496:                                    .getXpathSelect(),
497:
498:                            lf.getOcurSep());
499:
500:                            lf.setValue(cont);
501:
502:                            resColl.add(lf);
503:
504:                        }
505:
506:                        else {
507:
508:                            try {
509:
510:                                JDOMXPath xp = new JDOMXPath(lf
511:                                        .getXpathSelect());
512:
513:                                xp.setNamespaceContext(nsc);
514:
515:                                List selectList = xp.selectNodes(xmlDoc);
516:
517:                                Iterator i = selectList.iterator();
518:
519:                                while (i.hasNext()) {
520:
521:                                    LiusField lfoccur = new LiusField();
522:
523:                                    BeanUtils.copyProperties(lfoccur, lf);
524:
525:                                    Object selection = (Object) i.next();
526:
527:                                    if (selection instanceof  Element) {
528:
529:                                        Element elem = (Element) selection;
530:
531:                                        if (elem.getText().trim() != null &&
532:
533:                                        elem.getText().trim() != "") {
534:
535:                                            lfoccur.setValue(elem.getText());
536:
537:                                            resColl.add(lfoccur);
538:
539:                                        }
540:
541:                                    }
542:
543:                                    else if (selection instanceof  Attribute) {
544:
545:                                        Attribute att = (Attribute) selection;
546:
547:                                        lf.setValue(att.getValue());
548:
549:                                        resColl.add(lf);
550:
551:                                    }
552:
553:                                    else if (selection instanceof  Text) {
554:
555:                                        Text text = (Text) selection;
556:
557:                                        lf.setValue(text.getText());
558:
559:                                        resColl.add(lf);
560:
561:                                    }
562:
563:                                    else if (selection instanceof  CDATA) {
564:
565:                                        CDATA cdata = (CDATA) selection;
566:
567:                                        lf.setValue(cdata.getText());
568:
569:                                        resColl.add(lf);
570:
571:                                    }
572:
573:                                    else if (selection instanceof  Comment) {
574:
575:                                        Comment com = (Comment) selection;
576:
577:                                        lf.setValue(com.getText());
578:
579:                                        resColl.add(lf);
580:
581:                                    }
582:
583:                                    else if (selection instanceof  ProcessingInstruction) {
584:
585:                                        ProcessingInstruction pi = (ProcessingInstruction)
586:
587:                                        selection;
588:
589:                                        lf.setValue(pi.getData());
590:
591:                                        resColl.add(lf);
592:
593:                                    }
594:
595:                                    else if (selection instanceof  EntityRef) {
596:
597:                                        EntityRef er = (EntityRef) selection;
598:
599:                                        lf.setValue(er.toString());
600:
601:                                        resColl.add(lf);
602:
603:                                    }
604:
605:                                }
606:
607:                            }
608:
609:                            catch (JaxenException e) {
610:
611:                                logger.error(e.getMessage());
612:
613:                            }
614:
615:                            catch (InvocationTargetException ex) {
616:
617:                                logger.error(ex.getMessage());
618:
619:                            }
620:
621:                            catch (IllegalAccessException ex) {
622:
623:                                logger.error(ex.getMessage());
624:
625:                            }
626:
627:                        }
628:
629:                    }
630:
631:                    else {
632:
633:                        resColl.add(field);
634:
635:                    }
636:
637:                }
638:
639:            }
640:
641:            public List getAllDocumentNs(org.jdom.Document doc) {
642:
643:                List ls = new ArrayList();
644:
645:                processChildren(doc.getRootElement(), ls);
646:
647:                return ls;
648:
649:            }
650:
651:            private boolean exist(List nsLs, String nsUri) {
652:
653:                if (nsLs.isEmpty())
654:
655:                    return false;
656:
657:                for (int i = 0; i < nsLs.size(); i++) {
658:
659:                    if (((String) nsLs.get(i)).equals(nsUri)) {
660:
661:                        return true;
662:
663:                    }
664:
665:                }
666:
667:                return false;
668:
669:            }
670:
671:            private void processChildren(Element elem, List ns) {
672:
673:                Namespace nsCourent = (Namespace) elem.getNamespace();
674:
675:                String nsUri = (nsCourent.getURI());
676:
677:                if (!exist(ns, nsUri)) {
678:
679:                    ns.add(nsUri.trim());
680:
681:                    nsc.addNamespace(nsCourent.getPrefix(), nsCourent.getURI());
682:
683:                }
684:
685:                List additionalNs = elem.getAdditionalNamespaces();
686:
687:                if (!additionalNs.isEmpty())
688:
689:                    copyNsList(additionalNs, ns);
690:
691:                if (elem.getChildren().size() > 0) {
692:
693:                    List elemChildren = elem.getChildren();
694:
695:                    for (int i = 0; i < elemChildren.size(); i++) {
696:
697:                        processChildren((Element) elemChildren.get(i), ns);
698:
699:                    }
700:
701:                }
702:
703:            }
704:
705:            private void copyNsList(List nsElem, List nsRes) {
706:
707:                for (int i = 0; i < nsElem.size(); i++) {
708:
709:                    Namespace ns = (Namespace) nsElem.get(i);
710:
711:                    nsc.addNamespace(ns.getPrefix(), ns.getURI());
712:
713:                    nsRes.add(ns.getURI().trim());
714:
715:                }
716:
717:            }
718:
719:            /**
720:             * 
721:             * Permet de récupérer les champs de Lius à partir du fichier de
722:             * configuration
723:             * 
724:             * pour effectuer l'indexation.
725:             * 
726:             * <br/><br/>
727:             * 
728:             * Get Lius fields from the configuration file for indexing.
729:             *  
730:             */
731:
732:            public Collection getLiusFields(LiusConfig lc) {
733:
734:                return lc.getXmlFileFields();
735:
736:            }
737:
738:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.