5 Replies Latest reply: Dec 18, 2012 12:56 PM by odie_63 RSS

    Parsing XML feed

    skas
      I have codes as follows:
      SET SERVEROUTPUT ON
      SET DEFINE OFF
      DECLARE
      feedURL VARCHAR2(500);
      parser xmlparser.Parser;
      feedXML xmldom.DOMDocument;
      titles xmldom.DOMNodeList;

      titles_found NUMBER;
      curNode xmldom.DOMNode;
      textChild xmldom.DOMNode;

      BEGIN

      feedURL := 'http://192.168.2.30/cgi-bin/query-meta?v%3Asources=cansumtest&v%3Aproject=query-meta&query=HOOD&render.function=xml-feed-display&content-type=text/xml';

      parser := xmlparser.newParser;
      feedXML := xmlparser.parse( feedURL );
      xmlparser.freeParser(parser);

      titles := xmldom.getElementsByTagName(feedXML,'*');

      FOR j IN 1..xmldom.getLength(titles) LOOP
      curNode := xmldom.item(titles,j-1);
      textChild := xmldom.getFirstChild(curNode);

      dbms_output.put_line('('||LPAD(j,2)||') '||xmldom.getNodeValue(textChild));

      END LOOP;
      xmldom.freeDocument(feedXML);

      END;

      Above codes generate following results:
      ( 1)
      ( 2)
      ( 3)
      ( 4)
      ( 5)
      ( 6)
      ( 7)
      ( 8)
      ( 9)
      (10)
      (11) 150
      *(12) H0OH18101*
      (13) <span class="vivbold qt0">HOOD</span>, RONALD E
      (14)
      (15)
      (16) 155
      *(17) H8OH07073*
      (18) <span class="vivbold qt0">HOOD</span>, RONALD EDWARD
      (19)
      (20)
      (21)
      (22)
      (23) H0oh18101, Hood, Ronald
      (24)
      (25) Edward, Hood, Ronald
      PL/SQL procedure successfully completed.

      You can see by looping through, oracle is displaying all the elements in url feed. From the above results, I need to display only items 12 and 17 and ignore the rest. I am also providing XML feed below:

      <?xml version="1.0" ?>
      - <vce>
      <param name="v:sources" value="cansumtest" />
      <param name="v:project" value="query-meta" />
      <param name="query" value="HOOD" />
      <param name="render.function" value="xml-feed-display" />
      <param name="content-type" value="text/xml" />
      - <added-source test-strictly="test-strictly" name="cansumtest" type="vse" modified="1355412925" over-request="1.3" num="200" status="queried" requested="200" query-xml-supported="query-xml-supported" vse-vse="vse-vse" total-results="2" admin-url="http://velocity/vivisimo/cgi-bin/admin" stem="depluralize" stem2="none" stoplist="none" search-ms="1" retrieval-ms="0" total-results-with-duplicates="2" retrieved="2">
      <parse url="http://127.0.0.1:7205/search?query-xml=%3cterm%20field%3d%22query%22%20str%3d%22HOOD%22%20position%3d%220%22%20processing%3d%22strict%22%20input-type%3d%22user%22%20%2f%3e&binning-mode=normal&force-binning=0&max=300&start=0&num=200&staging=0&collection=cansumtest&r-o-p=0&sort-keys=1&shingles=1&summarize=1&cache=1&cache-data=0&score=0&show-duplicates=0&gen-key=0&n-collapse=0&collapsed-binning=0" start-time="57" end-time="64" http-status="200 OK" status="fetched processed parsed" retrieved="2" />
      </added-source>
      - <list path="" num="2" level="0" start="0" per="10">
      - <document url="oracle://192.168.2.19:1521/orcl/?key-val=9" rank="0" source="cansumtest" score="0.111111" truncated-url="oracle://192.168.2.19:1521/orcl/?key-val=9" context="http://192.168.2.30/cgi-bin/query-meta?v%3afile=viv_MLtzUk&v%3astate=%28root%29%7croot&v%3aframe=tree&subquery=id%3aNdoc0&active%3d=root&v%3asubsearch=1&">
      <content name="size" type="text">150</content>
      <content name="CAND_ID" type="text">H0OH18101</content>
      <content name="CAND_NM" type="text"><span class="vivbold qt0">HOOD</span>, RONALD E</content>
      <content name="snippet" type="html" />
      </document>
      - <document url="oracle://192.168.2.19:1521/orcl/?key-val=4015" rank="1" source="cansumtest" score="0.1" truncated-url="oracle://192.168.2.19:1521/orcl/?key-val=4015" context="http://192.168.2.30/cgi-bin/query-meta?v%3afile=viv_MLtzUk&v%3astate=%28root%29%7croot&v%3aframe=tree&subquery=id%3aNdoc1&active%3d=root&v%3asubsearch=1&">
      <content name="size" type="text">155</content>
      <content name="CAND_ID" type="text">H8OH07073</content>
      <content name="CAND_NM" type="text"><span class="vivbold qt0">HOOD</span>, RONALD EDWARD</content>
      <content name="snippet" type="html" />
      </document>
      </list>
      - <tree base-url="http://192.168.2.30/cgi-bin/query-meta?v%3afile=viv_MLtzUk&v:state=" recluster-base-url="http://192.168.2.30/cgi-bin/query-meta?v%3afile=viv_MLtzUk&">
      - <node type="top" level="0" ndocs="2" active="1" subnodes="0" ts="|root" ls="root|root" bs="|root">
      - <node type="document" level="1" ndocs="1" ts="(root)|root" ls="root|N0" bs="(root)|N0">
      <description>H0oh18101, Hood, Ronald</description>
      </node>
      - <node type="document" level="1" ndocs="1" ts="(root)|root" ls="root|N1" bs="(root)|N1">
      <description>Edward, Hood, Ronald</description>
      </node>
      </node>
      </tree>
      </vce>
        • 1. Re: Parsing XML feed
          odie_63
          You can see by looping through, oracle is displaying all the elements in url feed. From the above results, I need to display only items 12 and 17 and ignore the rest.
          Stop using DOM and start using scalable and straightforward techniques such as XPath and XQuery :
          SQL> select *
            2  from xmltable(
            3         '/vce/list/document'
            4         passing xdburitype('/public/feed.xml').getxml()
            5         columns
            6           cand_id   varchar2(30)  path 'content[@name="CAND_ID"]'
            7       )
            8  ;
           
          CAND_ID
          ------------------------------
          H0OH18101
          H8OH07073
           
          For my test I stored your feed sample in the XML DB repository, but in your case you can directly use your source url and the httpuritype constructor :
          select *
          from xmltable(
                 '/vce/list/document'
                 passing httpuritype('http://192.168.2.30/cgi-bin/query-meta?v%3Asources=cansumtest&v%3Aproject=query-meta&query=HOOD&render.function=xml-feed-display&content-type=text/xml').getxml()
                 columns
                   cand_id   varchar2(30)  path 'content[@name="CAND_ID"]'
               )
          ;
          • 2. Re: Parsing XML feed
            skas
            This is awesome. So simple and works like a charm. One more thing, I have included one more column in my query which returns both ID and NAME but NAME column also returns html codes something like SMITT, <span class="\vivbold qt0"">ERIK</span>.

            How can we filter html and display actual value which is SMITT, ERIK

            Thanks
            • 3. Re: Parsing XML feed
              odie_63
              Could you post the original XML feed again? But this time do not copy/paste from your browser because it resolves all character entity references and produces invalid/confusing content.
              Use "View source" feature on your page, or simply open it with a text editor.

              When posting here, always use the &#x7b;code} tags to enclose code snippets, as explained here : https://forums.oracle.com/forums/help.jspa

              Seems like the html tags are not stored as mixed content (in which case they would have been automatically stripped in the COLUMNS clause) :
              <content name="CAND_NM" type="text"><span class="vivbold qt0">HOOD</span>, RONALD EDWARD</content>
              but more like this, which doesn't show in the XML since character entities have been unescaped by the browser :
              <content name="CAND_NM" type="text">&#38;lt;span class=&#38;quot;vivbold qt0&#38;quot;&#38;gt;HOOD&#38;lt;/span&#38;gt;, RONALD EDWARD</content>
              • 4. Re: Parsing XML feed
                skas
                I have extracted following codes from the view source of the page. As you can see it is returning some htmls also. I am getting this feed from a search engine and all the searched terms, in my case "HOOD", are coming back as BOLD. Any idea how to filter these htmls?
                      <content name="CAND_ID" type="text">H0OH18101</content>
                      <content name="CAND_NM" type="text">&lt;span class="vivbold qt0"&gt;HOOD&lt;/span&gt;, RONALD E</content>
                • 5. Re: Parsing XML feed
                  odie_63
                  The easiest way is probably to apply a regular expression, either by postprocessing the column :
                  SQL> select cand_id
                    2       , regexp_replace(cand_nm, '<[^>]+>') as cand_nm
                    3  from xmltable(
                    4         '/vce/list/document'
                    5         passing xdburitype('/public/feed.xml').getxml()
                    6         columns
                    7           cand_id   varchar2(30)  path 'content[@name="CAND_ID"]'
                    8         , cand_nm   varchar2(80)  path 'content[@name="CAND_NM"]'
                    9       )
                   10  ;
                   
                  CAND_ID                        CAND_NM
                  ------------------------------ --------------------------------------------------------------------------------
                  H0OH18101                      HOOD, RONALD E
                  H8OH07073                      HOOD, RONALD EDWARD
                   
                  or directly in XMLTable :
                  SQL> select cand_id
                    2       , cand_nm
                    3  from xmltable(
                    4         '/vce/list/document'
                    5         passing xdburitype('/public/feed.xml').getxml()
                    6         columns
                    7           cand_id   varchar2(30)  path 'content[@name="CAND_ID"]'
                    8         , cand_nm   varchar2(80)  path 'ora:replace(content[@name="CAND_NM"], "<[^>]+>", "")'
                    9       )
                   10  ;
                   
                  CAND_ID                        CAND_NM
                  ------------------------------ --------------------------------------------------------------------------------
                  H0OH18101                      HOOD, RONALD E
                  H8OH07073                      HOOD, RONALD EDWARD