001    /*
002    // $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#4 $
003    // Package org.eigenbase.xom is an XML Object Mapper.
004    // Copyright (C) 2008-2008 The Eigenbase Project
005    // Copyright (C) 2008-2008 Disruptive Tech
006    // Copyright (C) 2008-2008 LucidEra, Inc.
007    //
008    // This library is free software; you can redistribute it and/or modify it
009    // under the terms of the GNU Lesser General Public License as published by the
010    // Free Software Foundation; either version 2 of the License, or (at your
011    // option) any later version approved by The Eigenbase Project.
012    //
013    // This library is distributed in the hope that it will be useful,
014    // but WITHOUT ANY WARRANTY; without even the implied warranty of
015    // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
016    // GNU Lesser General Public License for more details.
017    //
018    // You should have received a copy of the GNU Lesser General Public License
019    // along with this library; if not, write to the Free Software
020    // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
021    */
022    package org.eigenbase.xom.wrappers;
023    
024    import org.eigenbase.xom.*;
025    import org.w3c.dom.Node;
026    
027    import java.util.*;
028    import java.io.PrintWriter;
029    
030    /**
031     * Quick and dirty XML parser that finds the precise start and end
032     * position of all nodes in a document. Also finds all line endings, so
033     * that character offsets can be converted to line/column positions.
034     *
035     * @author jhyde
036     * @since 13 October, 2008
037     * @version $Id: //open/util/resgen/src/org/eigenbase/xom/wrappers/Annotator.java#4 $
038     */
039    public class Annotator {
040        private final List/*<LocInfo>*/ locInfoList = new ArrayList();
041        private int[] lineStartPositions;
042        private final String xml;
043        private final Map/*<DOMWrapper, LocInfo>*/ wrapperLocMap =
044            new HashMap();
045        private final Map/*<Node, LocInfo>*/ nodeLocMap = new HashMap();
046        private int seq; // workspace for populateMap
047    
048        /**
049         * Creates an Annotator.
050         *
051         * <p>For testing purposes, <code>wrapper</code> may be null. Parses the XML
052         * but does not build the mapping from location information to DOM nodes.
053         *
054         * @param xml XML source string
055         * @param def Wrapper around root DOM node
056         */
057        Annotator(String xml, DOMWrapper def) {
058            this.xml = xml;
059            parse(xml);
060            if (def != null) {
061                seq = 0;
062                populateMap(def);
063                assert this.nodeLocMap.size() == this.wrapperLocMap.size();
064            }
065        }
066    
067        public Location getLocation(DOMWrapper wrapper) {
068            LocInfo location0 = (LocInfo) wrapperLocMap.get(wrapper);
069            if (location0 == null) {
070                location0 = (Annotator.LocInfo)
071                    nodeLocMap.get(((W3CDOMWrapper) wrapper).node);
072                if (location0 == null) {
073                    return null;
074                }
075            }
076            final LocInfo location = location0;
077            return new Location() {
078                public int getStartLine() {
079                    return getLine(getStartPos()) + 1;
080                }
081    
082                public int getStartColumn() {
083                    return getCol(getStartPos()) + 1;
084                }
085    
086                public int getStartPos() {
087                    return location.startTagStartPos;
088                }
089    
090                public int getEndLine() {
091                    return getLine(getEndPos()) + 1;
092                }
093    
094                public int getEndColumn() {
095                    return getCol(getEndPos()) + 1;
096                }
097    
098                public int getEndPos() {
099                    return location.endTagEndPos >= 0
100                        ? location.endTagEndPos
101                        : location.startTagEndPos;
102                }
103    
104                public String getText(boolean headOnly) {
105                    return location.getText(headOnly);
106                }
107    
108                public String toString() {
109                    return location.toString(Annotator.this);
110                }
111            };
112        }
113    
114        /**
115         * Returns the list of LocInfo. For testing.
116         *
117         * @return list of LocInfo.
118         */
119        List getLocInfoList() {
120            return locInfoList;
121        }
122    
123        // enum State
124        private static final int
125            STATE_NORMAL = 0,
126            STATE_TAG = 1,
127            STATE_ENDTAG = 2,
128            STATE_QUOT = 3,
129            STATE_APOS = 4,
130            STATE_COMMENT = 5,
131            STATE_CDATA = 6;
132    
133        void parse(String s)
134        {
135            final ArrayStack/*<LocInfo>*/ lockInfoStack = new ArrayStack();
136            final List lineStartPositions = new ArrayList();
137            int state = STATE_NORMAL;
138            final int count = s.length();
139            int i = 0;
140            int last = 0;
141            lineStartPositions.add(new Integer(i));
142            lockInfoStack.push(null);
143            LocInfo location = null;
144            loop:
145            while (i < count) {
146                final char c = s.charAt(i);
147                switch (c) {
148                case '<':
149                    stateSwitch:
150                    switch (state) {
151                    case STATE_NORMAL:
152                        if (i > last) {
153                            // Unlike other node types, we create the LocInfo
154                            // at the end of the element. No need to add the node
155                            // to the stack, because we'd just remove it again.
156                            LocInfo loc2 =
157                                new LocInfo(locInfoList.size(), TYPE_TEXT, last);
158                            loc2.endTagEndPos = i;
159                            locInfoList.add(loc2);
160                        }
161                        if (i + 1 < count) {
162                            final char c1 = s.charAt(i + 1);
163                            switch (c1) {
164                            case '/':
165                                // ^</Tag>
166                                state = STATE_ENDTAG;
167                                assert location != null;
168                                break stateSwitch;
169                            case '?':
170                                // ^<?xml ... ?>
171                                location =
172                                    new LocInfo(
173                                        locInfoList.size(),
174                                        TYPE_PROCESSING_INSTRUCTION, i);
175                                locInfoList.add(location);
176                                state = STATE_TAG;
177                                i += "<?".length();
178                                continue loop;
179                            case '!':
180                                if (s.startsWith("--", i + 2)) {
181                                    // ^<!--
182                                    location =
183                                        new LocInfo(
184                                            locInfoList.size(),
185                                            TYPE_COMMENT, i);
186                                    locInfoList.add(location);
187                                    state = STATE_COMMENT;
188                                    i += "<!--".length();
189                                    continue loop;
190                                }
191                                if (s.startsWith("[CDATA[", i + 2)) {
192                                    // ^<![CDATA[
193                                    location =
194                                        new LocInfo(
195                                            locInfoList.size(),
196                                            TYPE_CDATA_SECTION, i);
197                                    locInfoList.add(location);
198                                    state = STATE_CDATA;
199                                    i += "<![CDATA[".length();
200                                    continue loop;
201                                }
202                                break;
203                            }
204                        }
205                        // Start of an element,
206                        // ^<Tag a1=v a2=v>
207                        // Don't push until we see end of the head tag <Tag ... ^>
208                        state = STATE_TAG;
209                        location = new LocInfo(locInfoList.size(), TYPE_ELEMENT, i);
210                        locInfoList.add(location);
211                        ++i;
212                        continue loop;
213                    }
214                    break;
215    
216                case '>':
217                    switch (state) {
218                    case STATE_TAG:
219                        ++i;
220                        assert location != null;
221                        switch (location.type) {
222                        case TYPE_PROCESSING_INSTRUCTION:
223                            // <? ... ?^>
224                        case TYPE_CDATA_SECTION:
225                            // <![CDATA[ ... ]]^>
226                        case TYPE_COMMENT:
227                            // <!-- ... --^>
228                            location.endTagEndPos = i;
229                            location = (LocInfo) lockInfoStack.peek();
230                            break;
231                        default:
232                            // <Tag^>
233                            location.startTagEndPos = i;
234                            lockInfoStack.push(location);
235                            break;
236                        }
237                        last = i;
238                        state = STATE_NORMAL;
239                        continue loop;
240    
241                    case STATE_ENDTAG:
242                        // </Tag^>
243                        ++i;
244                        assert location != null;
245                        location.endTagEndPos = i;
246                        try {
247                            location = (LocInfo) lockInfoStack.pop();
248                        } catch (IndexOutOfBoundsException e) {
249                            throw new RuntimeException(
250                                "i=" + i + ", xml=" + xml.substring(i)
251                                    + ", nodeList=" + locInfoList,
252                                e);
253                        }
254                        last = i;
255                        state = STATE_NORMAL;
256                        continue loop;
257                    }
258                    break;
259    
260                case '/':
261                    switch (state) {
262                    case STATE_TAG:
263                        ++i;
264                        if (i < count && s.charAt(i) == '>') {
265                            // <Tag a1=v1 a2=v2 ^/>
266                            ++i;
267                            location.endTagEndPos = i;
268                            // no need to pop; we never pushed when we saw '<'
269                            location = (LocInfo) lockInfoStack.peek();
270                            last = i;
271                            state = STATE_NORMAL;
272                        }
273                        continue loop;
274                    }
275                    break;
276    
277                case ']':
278                    switch (state) {
279                    case STATE_CDATA:
280                        if (s.startsWith("]>", i + 1)) {
281                             // <![CDATA[ ... ^]]>
282                            state = STATE_NORMAL;
283                            i += "]]>".length();
284                            location.endTagEndPos = i;
285                            location = (LocInfo) lockInfoStack.peek();
286                            last = i;
287                            continue loop;
288                        }
289                    }
290                    break;
291    
292                case '-':
293                    switch (state) {
294                    case STATE_COMMENT:
295                        if (s.startsWith("->", i + 1)) {
296                            // <!-- xxxxx^-->
297                            i += "-->".length();
298                            location.endTagEndPos = i;
299                            last = i;
300                            location = (LocInfo) lockInfoStack.peek();
301                            state = STATE_NORMAL;
302                            continue loop;
303                        }
304                    }
305                    break;
306    
307                case '\r':
308                    ++i;
309                    if (i < count && s.charAt(i) == '\n') {
310                        // only count windows line ending CR LF as one line
311                        ++i;
312                    }
313                    lineStartPositions.add(new Integer(i));
314                    continue loop;
315    
316                case '\n':
317                    ++i;
318                    lineStartPositions.add(new Integer(i));
319                    continue loop;
320    
321                case '\'':
322                    switch (state) {
323                    case STATE_APOS:
324                        // a='xxx^'
325                        state = STATE_TAG;
326                        break;
327                    case STATE_TAG:
328                        // a=^'xxx'
329                        state = STATE_APOS;
330                        break;
331                    case STATE_QUOT:
332                        // a="doesn^'t matter"
333                    default:
334                        break;
335                    }
336                    break;
337    
338                case '"':
339                    switch (state) {
340                    case STATE_QUOT:
341                        // a="xxx^"
342                        state = STATE_TAG;
343                        break;
344                    case STATE_TAG:
345                        // a=^"xxx"
346                        state = STATE_QUOT;
347                        break;
348                    case STATE_APOS:
349                        // a='doesn^"t matter'
350                    default:
351                        break;
352                    }
353                    break;
354                }
355    
356                ++i;
357            }
358            this.lineStartPositions = new int[lineStartPositions.size()];
359            for (int j = 0; j < lineStartPositions.size(); j++) {
360                this.lineStartPositions[j] =
361                    ((Integer) lineStartPositions.get(j)).intValue();
362            }
363        }
364    
365        private void populateMap(DOMWrapper def)
366        {
367            final int defType = def.getType();
368            LocInfo location;
369            while (true) {
370                location = (LocInfo) locInfoList.get(seq++);
371                if (defType == DOMWrapper.ELEMENT
372                    && location.type == TYPE_ELEMENT)
373                {
374                    break;
375                }
376                if (defType == DOMWrapper.CDATA
377                    && location.type == TYPE_TEXT)
378                {
379                    break;
380                }
381                if (seq >= locInfoList.size()) {
382                    return;
383                }
384            }
385            wrapperLocMap.put(def, location);
386            nodeLocMap.put(((W3CDOMWrapper) def).node, location);
387            final DOMWrapper[] elementChildren = def.getElementChildren();
388            for (int i = 0; i < elementChildren.length; i++) {
389                DOMWrapper domWrapper = elementChildren[i];
390                populateMap(domWrapper);
391            }
392        }
393    
394        /**
395         * Returns the line that a character position falls on. The first line in a
396         * document is numbered 0.
397         *
398         * @param pos Character position
399         * @return Line (starting from 0)
400         */
401        int getLine(int pos)
402        {
403            int index = Arrays.binarySearch(lineStartPositions, pos);
404            if (index >= 0) {
405                return index;
406            } else {
407                return -2 - index;
408            }
409        }
410    
411        /**
412         * Returns the column that a character position falls on. The first column
413         * in a line is numbered 0.
414         *
415         * @param pos Character position
416         * @return column (starting from 0)
417         */
418        int getCol(int pos)
419        {
420            int index = Arrays.binarySearch(lineStartPositions, pos);
421            if (index >= 0) {
422                return 0;
423            } else {
424                index = -2 - index;
425                return pos - lineStartPositions[index];
426            }
427        }
428    
429        void list(PrintWriter pw)
430        {
431            for (int i = 0; i < locInfoList.size(); i++) {
432                LocInfo location = (LocInfo) locInfoList.get(i);
433                pw.println(
434                    location.seq + ": " + location.toString(this) + " ["
435                        + location.getText(xml) + "]");
436            }
437            pw.flush();
438        }
439    
440        // enum Type
441        private static final int
442            TYPE_ELEMENT = Node.ELEMENT_NODE,
443            TYPE_PROCESSING_INSTRUCTION = Node.PROCESSING_INSTRUCTION_NODE,
444            TYPE_COMMENT = Node.COMMENT_NODE,
445            TYPE_CDATA_SECTION = Node.CDATA_SECTION_NODE,
446            TYPE_TEXT = Node.TEXT_NODE;
447    
448        class LocInfo {
449            /** Sequence in document, ordered by start position (prefix order) */
450            final int seq;
451            /** Node type, typically {@link Node#ELEMENT_NODE}. */
452            final int startTagStartPos;
453            final int type;
454            int startTagEndPos = -1; // -1 if entity is a single tag
455            int endTagEndPos = -1;
456    
457            /**
458             * Creates a LocInfo.
459             *
460             * @param seq Sequence number in document
461             * @param nodeType Node type, typically {@link Node#ELEMENT_NODE}.
462             * @param startTagStartPos Position of start of element
463             */
464            LocInfo(int seq, int nodeType, int startTagStartPos) {
465                this.seq = seq;
466                this.type = nodeType;
467                this.startTagStartPos = startTagStartPos;
468            }
469    
470            public String toString(Annotator annotator) {
471                return "line " + annotator.getLine(startTagStartPos)
472                    + ", column " + annotator.getCol(startTagStartPos);
473            }
474    
475            /**
476             * Returns the fragment of source XML that this node encompasses.
477             *
478             * @param xml Whole source XML
479             * @return fragment of source XML
480             */
481            public String getText(String xml) {
482                return xml.substring(
483                    startTagStartPos,
484                    endTagEndPos >= 0 ? endTagEndPos
485                        : xml.length());
486            }
487    
488            /**
489             * Returns the fragment of source XML corresponding to the head tag
490             * of this element, if this is an element, otherwise the whole node.
491             *
492             * @param xml Whole source XML
493             * @return fragment of source XML
494             */
495            public String getHeadText(String xml) {
496                return xml.substring(
497                    startTagStartPos,
498                    startTagEndPos >= 0 ? startTagEndPos
499                        : endTagEndPos >= 0 ? endTagEndPos
500                            : xml.length());
501            }
502    
503            public String toString() {
504                return getHeadText(xml);
505            }
506    
507            /**
508             * Returns the text of this location. Specification as for
509             * {@link org.eigenbase.xom.Location#getText(boolean)}.
510             *
511             * @param headOnly Whether to return only the head of elements
512             * @return Source text underlying a location
513             */
514            public String getText(boolean headOnly) {
515                return xml.substring(
516                    startTagStartPos,
517                    headOnly && startTagEndPos >= 0
518                        ? startTagEndPos
519                        : endTagEndPos >= 0
520                        ? endTagEndPos
521                        : xml.length());
522            }
523        }
524    
525        /**
526         * Similar to {@link Stack} but based on {@link ArrayList} instead of
527         * {@link Vector}, and therefore more efficient.
528         */
529        private static class ArrayStack extends ArrayList {
530            public final void push(Object t)
531            {
532                if (false) System.out.println(size() + " push [" + t + "]");
533                add(t);
534            }
535    
536            public final Object peek()
537            {
538                return get(size() - 1);
539            }
540    
541            public final Object pop()
542            {
543                final int index = size() - 1;
544                Object t = remove(index);
545                if (false) System.out.println(size() + " pop  [" + t + "]");
546                return get(index - 1);
547            }
548        }
549    }
550    
551    // End Annotator.java