2 Replies Latest reply: Dec 19, 2006 10:37 AM by 807607 RSS

    stax reading /writing need help from xml guru plz

    807607
      hi, i have been told that stax reading/writing should involve no overhead, which is why i use it. i am now able to write my large data to file, but when using my reader i seem to run out of memory. using the netbeans profiler i have found that char[] seems to be the problem.
      by backtracing i have found that javax.xml.parsers.SAXParser.parse calls the xerces packages, which eventually leads to the char[]. my code for my reader is attached here...
      package utilities;
      
      import Categorise.Collection;
      import Categorise.Comparison;
      import Categorise.TestCollection;
      import java.io.File;
      import java.io.IOException;
      import java.util.ArrayList;
      import java.util.HashMap;
      import java.util.List;
      import java.util.Map;
      import javax.xml.parsers.*;
      import measures.Protocol;
      import org.xml.sax.Attributes;
      import org.xml.sax.SAXException;
      import org.xml.sax.helpers.DefaultHandler;
      
      /**
       *
       * @author dthomas
       */
      public class XMLParser extends DefaultHandler
      {
          static Collection collection = new Collection();
          List<Short> cList;
          List<Comparison> comparisonList;
          File trainFileName;
          File testFileName;
          TestCollection tc;
          List<TestCollection> testCollectionList;
          List<File> testFileNameList = new ArrayList<File>();
          List<File> trainFileNameList = new ArrayList<File>();
      
          boolean allTrainsAdded = false;
          Protocol protocol;
          List<File> trainingDirList;
          File testingDir;
          int counter = 0;
          
          
          File[ ] trainingDirs;
          File[ ] trainingFileNames;
          File[ ] testFileNames;
          TestCollection[ ] testCollections;
          Comparison[ ] comparisons;
          Comparison c;
          short[ ] cCounts;
          String order;
          String value;
          File trainDir;
          
          /** Creates a new instance of XMLParser */
          public XMLParser() {
          }
          
          public static Collection read( File aFile )
          {
              long startTime = System.currentTimeMillis();
              System.out.println( "Reading XML..." );
              SAXParserFactory spf = SAXParserFactory.newInstance();
              SAXParser sp;
              try {
                  sp = spf.newSAXParser();
      
                  sp.parse( aFile, new XMLParser() );
                  
              } catch (IOException ex) {
                  ex.printStackTrace();
              } catch (SAXException ex) {
                  ex.printStackTrace();
              } catch (ParserConfigurationException ex) {
                  ex.printStackTrace();
              }
              long endTime = System.currentTimeMillis();
              long totalTime = ( endTime - startTime ) / 1000;
              System.out.println( "Done..."  + totalTime + " seconds" );
              return collection;
      
          }
          
          public void startElement(String uri,String localName,String qName, Attributes attributes)
          {
              if( qName.equals( "RE" ) )
              {
                  testCollectionList = new ArrayList<TestCollection>();
              }
              else if( qName.equals( "p") )
              {
                  boolean isConcatenated = new Boolean( attributes.getValue( "c" ) );
                  boolean isStatic = new Boolean( attributes.getValue( "s" ) );
                  protocol = new Protocol( isConcatenated, isStatic );
              }
              else if( qName.equals( "trdl" ) )
              {
                  trainingDirList = new ArrayList<File>();
              }
              else if( qName.equals( "trd" ) )
              {
                  trainDir = new File( attributes.getValue( "fn" ) );
                  trainingDirList.add( trainDir );
              }
              else if( qName.equals( "td" ) )
              {
                  testingDir = new File( attributes.getValue( "fn" ) );
              }
              else if( qName.equals( "TC" ) )
              {
                  counter++;
                  System.out.println( counter );
                  comparisonList = new ArrayList<Comparison>();
                 
                  testFileName = new File( attributes.getValue( "tfn" ) );
                  testFileNameList.add( testFileName );
                  
                  tc = new TestCollection( );
      
                  tc.setTestFileName( testFileName );
              }
              else if ( qName.equals( "r" ) )
              {
               order = attributes.getValue( "o" );
      
                  value = attributes.getValue( "v" );
      
                  cList.add( Short.parseShort( order ), new Short( value ) );
                  
              }
              else if( qName.equals( "c" ) )
              {
                  cList = new ArrayList<Short>();
      
                  trainFileName = new File( attributes.getValue( "trfn" ) );
                  if( !allTrainsAdded )
                  {
                      trainFileNameList.add( trainFileName );
                  }
              }
      
          }
          
          public void characters(char []ch,int start,int length)
          {
              //String str=new String(ch,start,length);
              //System.out.print(str);
          }
          
          public void endElement(String uri,String localName,String qName)
          {
              if (qName.equals( "c") )
              {
                  allTrainsAdded = true;
                  cCounts = new short[ cList.size() ];       
                  for( int i = 0; i < cCounts.length; i++ )
                  {
                      cCounts[ i ] = cList.get( i );
                  }
                  
                  c = new Comparison( trainFileName, tc );
                  c.setcCounts( cCounts );
                  this.comparisonList.add( c );
      
              }
              else if( qName.equals( "TC" ) )
              {
                  comparisons = new Comparison[ comparisonList.size() ];
                  comparisonList.toArray( comparisons );            
                  
                  tc.setComparisons( comparisons );
                  
                  testCollectionList.add( tc );
              }
              else if( qName.equals( "RE" ) )
              {
                  testCollections = new TestCollection[ testCollectionList.size() ];
                  testCollectionList.toArray( testCollections );
                  collection.setTestCollections( testCollections );
                  
                  testFileNames = new File[ testFileNameList.size() ];
                  testFileNameList.toArray( testFileNames );
                  collection.setTestingFiles( testFileNames );
                  
                  //String[ ] testCategories = new String[ testCategoryList.size() ];
                  //testCategoryList.toArray( testCategories );
                  //collection.setTestCategories( testCategories );
                  
                  trainingFileNames = new File[ trainFileNameList.size() ];
                  trainFileNameList.toArray( trainingFileNames );
                  collection.setTrainingFiles( trainingFileNames );
                  
                  //String[ ] trainingCategories = new String[ trainCategoryList.size() ];
                  //trainCategoryList.toArray( trainingCategories );
                  //collection.setTrainingCategories( trainingCategories );
                  
                  collection.setProtocol( protocol );
                  
                  trainingDirs = new File[ trainingDirList.size() ];
                  trainingDirList.toArray( trainingDirs );            
                  collection.setTrainingDirs( trainingDirs );
                  
                  collection.setTestingDir( testingDir );
              }
           //else
               //System.out.println("End element:   {" + uri + "}" + localName);
      
          }
      }
      i thought it may have been a recursion problem, hence having so many instance variables instead of local ones, but that hasn't helped.

      all i need at the end of this is a Collection which holds an array of testCollections, which in turn hold arrays of cCounts. i was able to hold all of this in memory before, since all i am loading is what was in memory before it was written.

      can someone plz help

      ps when i use tail in unix to read the end of the xml file it doesn't work correctly: i cannot specify the number of lines to show, and it shows the whole file, as though it is not split into lines and contains no newline chars at all. it is stored as one long stream. is this correct??

      here is a snippet of the xml file:

      <TC tfn="
      /homedir/dthomas/Desktop/4News2/output/split3/alt.atheism/53458"><c trfn="/homed
      ir/dthomas/Desktop/4News2/output/split0/alt.atheism/53586"><r o="0" v="0"></r><r
      o="1" v="724"></r><r o="2" v="640"></r><r o="3" v="413"></r><r o="4" v="245"></
      r><r o="5" v="148"></r><r o="6" v="82"></r><r o="7" v="52"></r><r o="8" v="40"><
      /r><r o="9" v="30"></r><r o="10" v="22"></r><r o="11" v="16"></r><r o="12" v="11
      "></r><r o="13" v="8"></r><r o="14" v="5"></r><r o="15" v="2"></r></c><c trfn="/
      homedir/dthomas/Desktop/4News2/output/split0/alt.atheism/53495"><r o="0" v="0"><
      /r><r o="1" v="720"></r><r o="2" v="589"></r><r o="3" v="349"></r><r o="


      please, if anyone has any ideas from this code why a char[] would use 50% of the memory taken, and why the average age seems to show that the same one continues to grow, let me know.

      thanks in advance

      danny =)