This discussion is archived
3 Replies Latest reply: Dec 14, 2006 3:19 AM by 807607 RSS

Help urgently needed with xml reading

807607 Newbie
Currently Being Moderated
Hi, I have a 3 GB XML file, and I am surprised it is so large, as I have even shortened all of the names.
It contains 2000 elements, each of which has a filename and an array of size 6000; each element in this array has an array of shorts (say 100 of them) and also a filename.

I have used StAX writing to be able to write such a large file without having memory problems.

I then had to program a StAX reader to be able to read it all in without building the whole tree, but with some large cases I am now getting out of memory again. It seems to be from the read, so maybe the code, even though it doesn't have to hold the tree in memory, has to hold the file in memory. The code for reading is shown here:
package utilities;

import Categorise.Collection;
import Categorise.Comparison;
import Categorise.TestCollection;
import java.io.IOException;
import javax.xml.parsers.*;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.Attributes;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import measures.Protocol;

/**
 *
 * @author dthomas
 */
public class XMLParser extends DefaultHandler
{
    static Collection collection = new Collection();
    List<Short> cList;
    List<Comparison> comparisonList;
    File trainFileName;
    File testFileName;
    TestCollection tc;
    List<TestCollection> testCollectionList;
    List<File> testFileNameList = new ArrayList<File>();
    List<File> trainFileNameList = new ArrayList<File>();
    boolean allTrainsAdded = false;
    Protocol protocol;
    List<File> trainingDirList;
    File testingDir;
    int counter = 0;
    
    /** Creates a new instance of XMLParser */
    public XMLParser() {
    }
    
    public static Collection read( File aFile )
    {
        long startTime = System.currentTimeMillis();
        System.out.println( "Reading XML..." );
        SAXParserFactory spf = SAXParserFactory.newInstance();
        SAXParser sp;
        try {
            sp = spf.newSAXParser();

            sp.parse( aFile, new XMLParser() );
            
        } catch (IOException ex) {
            ex.printStackTrace();
        } catch (SAXException ex) {
            ex.printStackTrace();
        } catch (ParserConfigurationException ex) {
            ex.printStackTrace();
        }
        long endTime = System.currentTimeMillis();
        long totalTime = ( endTime - startTime ) / 1000;
        System.out.println( "Done..."  + totalTime + " seconds" );
        return collection;

    }
    
    public void startElement(String uri,String localName,String qName, Attributes attributes)
    {
       
        if( qName.equals( "RE" ) )
        {
            testCollectionList = new ArrayList<TestCollection>();
        }
        else if( qName.equals( "p") )
        {
            boolean isConcatenated = new Boolean( attributes.getValue( "c" ) );
            boolean isStatic = new Boolean( attributes.getValue( "s" ) );
            protocol = new Protocol( isConcatenated, isStatic );
        }
        else if( qName.equals( "trdl" ) )
        {
            trainingDirList = new ArrayList<File>();
        }
        else if( qName.equals( "trd" ) )
        {
            File trainDir = new File( attributes.getValue( "fn" ) );
            trainingDirList.add( trainDir );
        }
        else if( qName.equals( "td" ) )
        {
            testingDir = new File( attributes.getValue( "fn" ) );
        }
        else if( qName.equals( "TC" ) )
        {
            counter++;
            System.out.println( counter );
            comparisonList = new ArrayList<Comparison>();
           
            testFileName = new File( attributes.getValue( "tfn" ) );
            testFileNameList.add( testFileName );
        
            tc = new TestCollection( );
            
            tc.setTestFileName( testFileName );
        }
        else if ( qName.equals( "r" ) )
        {
         String order = attributes.getValue( "o" );
            String type = attributes.getValue( "t" );
            String value = attributes.getValue( "v" );

            cList.add( Short.parseShort( order ), new Short( value ) );
            
        }
        else if( qName.equals( "c" ) )
        {
            cList = new ArrayList<Short>();

            trainFileName = new File( attributes.getValue( "trfn" ) );
            if( !allTrainsAdded )
            {
                trainFileNameList.add( trainFileName );
            }
        }

    }
    public void characters(char []ch,int start,int length)
    {
        String str=new String(ch,start,length);
        System.out.print(str);
    }
    public void endElement(String uri,String localName,String qName)
    {
        if (qName.equals( "c") )
        {
            allTrainsAdded = true;
            short[ ] cCounts = new short[ cList.size() ];       
            for( int i = 0; i < cCounts.length; i++ )
            {
                cCounts[ i ] = cList.get( i );
            }
            
            Comparison c = new Comparison( trainFileName, tc );
            c.setcCounts( cCounts );
            this.comparisonList.add( c );

        }
        else if( qName.equals( "TC" ) )
        {
            Comparison[ ] comparisons = new Comparison[ comparisonList.size() ];
            comparisonList.toArray( comparisons );            
            
            tc.setComparisons( comparisons );
            
            testCollectionList.add( tc );
        }
        else if( qName.equals( "RE" ) )
        {
            TestCollection[ ] testCollections = new TestCollection[ testCollectionList.size() ];
            testCollectionList.toArray( testCollections );
            collection.setTestCollections( testCollections );
            
            File[ ] testFileNames = new File[ testFileNameList.size() ];
            testFileNameList.toArray( testFileNames );
            collection.setTestingFiles( testFileNames );
            
            File[ ] trainingFileNames = new File[ trainFileNameList.size() ];
            trainFileNameList.toArray( trainingFileNames );
            collection.setTrainingFiles( trainingFileNames );
                
            collection.setProtocol( protocol );
            
            File[ ] trainingDirs = new File[ trainingDirList.size() ];
            trainingDirList.toArray( trainingDirs );            
            collection.setTrainingDirs( trainingDirs );
            
            collection.setTestingDir( testingDir );
        }

    }
}
any help would be greatly greatly appreciated

thanks

Danny =)

PS: With the amount of data contained, should the XML file be this large? And does anyone know the size difference of holding a filename as a File rather than a String and then creating the File when it is needed?