Subversion Repositories general

Compare Revisions

Ignore whitespace Rev 1343 → Rev 1344

/xmlparser_java/trunk/XmlParser.java
1,6 → 1,6
/*
* input 2072160977 bytes; AMD Athlon 3100+, disk speed 38.0 MB/s
* 49.18s user 10.07s system 89% cpu 1:06.32 total; RES 9384K (empty java - 8200K)
* 137.05s user 10.79s system 89% cpu 2:45.85 total; RES 9420K (empty java - 8200K)
*/
import java.io.*;
 
77,27 → 77,133
}
 
// --------------------------------------------------------------------------------------------------------------------
/**
 * Position-tracking input stream with a one-byte push-back, used by the parser.
 *
 * <p>The wrapped stream is read through a {@link BufferedInputStream}. Positions are
 * zero-based byte offsets; before anything has been read the position is -1.
 *
 * <p>Only a single mark is remembered: calling {@code mark} again replaces the previous
 * mark, so nested mark/reset pairs all rewind to the most recent {@code mark} call.
 */
class XmlInputStream
    extends InputStream
{
    /** Buffer size of the wrapped stream and the default mark read limit, in bytes. */
    private static final int BUFFER_SIZE = 1024;

    private final InputStream origin;

    /** Offset of the byte held in {@link #lastRead}; -1 until the first read or skip. */
    private long pos = -1;
    /** Result of the most recent real read (a byte value 0..255, or -1 for end of stream). */
    private int lastRead = -1;
    /** True when {@link #lastRead} has been pushed back and must be returned by the next read. */
    private boolean useLast = false;
    /** True when the most recent real read hit the end of the stream. */
    private boolean atEof = false;

    // Complete snapshot of the state above, taken by mark() and restored by reset().
    private long markedPos = -1;
    private int markedRead = -1;
    private boolean markedUseLast = false;
    private boolean markedEof = false;

    /**
     * Wraps the given stream in a buffer of {@link #BUFFER_SIZE} bytes.
     *
     * @param origin stream to read from
     */
    public XmlInputStream(InputStream origin)
    {
        this.origin = new BufferedInputStream(origin, BUFFER_SIZE);
    }

    /**
     * @return offset of the byte most recently returned by {@link #read()} (or, after
     *         {@link #back()}, of the byte that will be returned again); -1 before any read
     */
    public long getPosition()
    {
        return pos;
    }

    /**
     * @return offset of the byte the next {@link #read()} call will return
     */
    public long getNextPosition()
    {
        return (useLast ? pos : pos + 1);
    }

    /**
     * Tells whether the most recent real read reached the end of the stream.
     *
     * <p>Unlike a test based on the position counter, this is correct for an empty stream
     * (first read already returns -1) and after {@link #skip(long)} or {@link #reset()}.
     *
     * @return {@code true} once a read has returned -1 and the stream has not been rewound since
     */
    public boolean eof()
    {
        return atEof;
    }

    /**
     * Pushes the most recently read value back, so the next {@link #read()} returns it again.
     * Only one level of push-back exists; calling this twice in a row has no additional effect.
     */
    public void back()
    {
        useLast = true;
    }

    /**
     * Returns the pushed-back value if there is one, otherwise reads the next byte.
     *
     * @return the byte as 0..255, or -1 at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public int read()
        throws IOException
    {
        if(useLast) {
            // replay the pushed-back value; position already points at it
            useLast = false;
            return lastRead;
        }

        lastRead = origin.read();
        atEof = (lastRead < 0);
        // the counter advances on an end-of-stream read too, so after the first EOF
        // the position equals the stream length
        ++pos;

        return lastRead;
    }

    /**
     * Bulk reads are deliberately not offered; the parser works byte by byte.
     *
     * @throws IOException always, with the message "not supported"
     */
    @Override
    public int read(byte[] b, int off, int len)
        throws IOException
    {
        throw new IOException("not supported");
    }

    /**
     * Skips up to {@code n} bytes. A pending pushed-back byte counts as the first skipped byte.
     *
     * <p>After a skip the value replayed by {@link #back()} is undefined, because the
     * skipped bytes are never seen by this class.
     *
     * @param n number of bytes to skip
     * @return number of bytes actually skipped (0 if {@code n <= 0} or at end of stream)
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public long skip(long n)
        throws IOException
    {
        long skipped = 0;

        if(useLast && n > 0) {
            if(lastRead < 0) {
                // the pushed-back value is the end-of-stream marker: nothing left to skip
                return 0;
            }
            // consume the pushed-back byte; pos already refers to it
            useLast = false;
            skipped = 1;
        }

        long res = origin.skip(n - skipped);

        if(res > 0) {
            pos += res;
            atEof = false;
            skipped += res;
        }

        return skipped;
    }

    /**
     * @return estimate of the bytes readable without blocking, including a pending
     *         pushed-back byte
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public int available()
        throws IOException
    {
        int count = origin.available();

        if(useLast && lastRead >= 0 && count < Integer.MAX_VALUE) {
            ++count;
        }

        return count;
    }

    /**
     * Closes the wrapped stream.
     *
     * @throws IOException if the wrapped stream fails to close
     */
    @Override
    public void close()
        throws IOException
    {
        origin.close();
    }

    /**
     * @return whether the wrapped (buffered) stream supports mark/reset
     */
    @Override
    public boolean markSupported()
    {
        return origin.markSupported();
    }

    /**
     * Marks the current position with a read limit of {@link #BUFFER_SIZE} bytes.
     */
    public void mark()
    {
        mark(BUFFER_SIZE);
    }

    /**
     * Remembers the current position together with the push-back and end-of-stream state,
     * replacing any earlier mark.
     *
     * @param readlimit number of bytes that may be read before the mark becomes invalid
     */
    @Override
    public void mark(int readlimit)
    {
        markedPos = pos;
        markedRead = lastRead;
        markedUseLast = useLast;
        markedEof = atEof;
        origin.mark(readlimit);
    }

    /**
     * Rewinds to the last mark and restores position, last read value, push-back flag and
     * end-of-stream flag exactly as they were when the mark was taken.
     *
     * @throws IOException if no valid mark exists or the wrapped stream fails
     */
    @Override
    public void reset()
        throws IOException
    {
        origin.reset();
        pos = markedPos;
        lastRead = markedRead;
        useLast = markedUseLast;
        atEof = markedEof;
    }
}
 
// --------------------------------------------------------------------------------------------------------------------
class XmlDocument
{
private InputStream in;
private byte[] buf = new byte[2048]; // (buf.length % 16 == 0)
private int bufLen;
private int bufPos;
private long bufOffset;
private long line;
private long linePos;
private XmlInputStream in;
private long line;
private long linePos;
 
public void parse(InputStream in)
throws XmlException, IOException
{
if(buf.length % 16 != 0) {
throwException("wrong buffer size: " + buf.length);
}
this.in = in;
this.bufLen = 0;
this.bufPos = 0;
this.bufOffset = 0;
this.in = new XmlInputStream(in);
this.line = 1;
this.linePos = 0;
114,7 → 220,7
/**
 * Raises an XmlException built from the message, the current line counter and a position value.
 *
 * This listing is a revision comparison: the first throw (bufPos) appears to be the older
 * line and the second (in.getPosition()) its replacement. In both, the value passed in the
 * slot labelled linePos by the inline comment is an offset, not a column within the line.
 *
 * @param message text for the exception
 * @throws XmlException always
 */
private void throwException(String message)
throws XmlException, IOException
{
throw new XmlException(message, line, /*linePos*/ bufPos);
throw new XmlException(message, line, /*linePos*/ in.getPosition());
}
 
private void log(String message)
125,49 → 231,20
/**
 * Renders a selection as text.
 *
 * This listing is a revision comparison: the first return decodes the selected bytes of buf
 * as UTF-8 (apparently the older line); the last return yields only "begin,length" and the
 * decoding variant is left commented out.
 * NOTE(review): the stream-based revision shown here has no buffer to decode from, so callers
 * get offsets rather than the selected text - confirm this is intended.
 *
 * @param sel selection whose begin offset and length are used
 */
private String toString(XmlSelection sel)
throws IOException
{
return new String(buf, (int)(sel.begin - bufOffset), (int)sel.getLength(), "UTF-8");
//return new String(buf, (int)(sel.begin - bufOffset), (int)sel.getLength(), "UTF-8");
return (sel.begin + "," + (int)sel.getLength());
}
private void saveSelBegin(XmlSelection sel)
{
sel.begin = bufOffset + bufPos;
sel.begin = in.getNextPosition();
}
private void saveSelEnd(XmlSelection sel)
{
sel.end = bufOffset + bufPos;
sel.end = in.getPosition();
}
private boolean ensureNext(int count)
throws XmlException, IOException
{
if(bufPos + count >= bufLen) {
//log("ensureNext start " + bufPos + " " + count);
if(bufLen == 0) {
// read full buffer at begin
bufLen = in.read(buf);
bufPos = 0;
}
else if(bufLen < buf.length) {
// we could not fill full buffer last time - no more data
return false;
}
else {
// move last 1/16 of data to begin, fill rest with new data
System.arraycopy(buf, buf.length / 16 * 15, buf, 0, buf.length / 16);
int read = in.read(buf, buf.length / 16, buf.length / 16 * 15);
bufLen = buf.length / 16 + read;
bufPos -= buf.length / 16 * 15;
bufOffset += buf.length / 16 * 15;
}
return (bufPos + count < bufLen);
}
else {
return true;
}
}
private boolean parseProlog()
throws XmlException, IOException
{
183,23 → 260,24
/**
 * Consumes a run of space, tab, line feed and carriage return characters.
 *
 * This listing is a revision comparison, so buffer-based lines (buf/bufPos/ensureNext) and
 * stream-based lines (in.read()/in.back()) appear side by side. In the stream-based lines the
 * first non-whitespace byte is pushed back with in.back() so the caller can read it again;
 * end of input simply ends the run.
 *
 * @return true if at least one whitespace character was consumed
 */
private boolean skipSpaces()
throws XmlException, IOException
{
//log("skipSpaces begin " + bufPos);
//log("skipSpaces begin " + in.getPosition());
boolean found = false;
for(;;) {
if(bufPos >= bufLen && !ensureNext(1)) break;
byte c = buf[bufPos];
if(c == ' ' || c == '\t' || c == '\n' || c == '\r') {
int c = in.read();
if(c < 0) {
break;
}
else if(c == ' ' || c == '\t' || c == '\n' || c == '\r') {
found = true;
++bufPos;
}
else {
in.back();
break;
}
}
//log("skipSpaces " + found + " " + bufPos);
//log("skipSpaces " + found + " " + in.getPosition());
return found;
}
 
206,13 → 284,13
private boolean parseDecl()
throws XmlException, IOException
{
//log("parseDecl begin " + bufPos);
//log("parseDecl begin " + in.getPosition());
// begin
if(!testChar('<', bufPos) || !testChar('?', bufPos) || !testChar('x', bufPos)
|| !testChar('m', bufPos) || !testChar('l', bufPos))
if(!testChar('<') || !testChar('?') || !testChar('x')
|| !testChar('m') || !testChar('l'))
{
//log("parseDecl no 'xml' " + bufPos);
//log("parseDecl no 'xml' " + in.getPosition());
return false;
}
225,11 → 303,11
}
 
// end
if(!testChar('?', bufPos) || !testChar('>', bufPos)) {
if(!testChar('?') || !testChar('>')) {
throwException("end of XML declaration expected");
}
//log("parseDecl ok " + bufPos);
//log("parseDecl ok " + in.getPosition());
return true;
}
 
244,58 → 322,58
private boolean parseName(XmlSelection sel)
throws XmlException, IOException
{
//log("parseName begin " + bufPos);
//log("parseName begin " + in.getPosition());
saveSelBegin(sel);
int start = bufPos;
in.mark();
if(bufPos >= bufLen && !ensureNext(1)) {
int c = in.read();
if(c < 0) {
return false;
}
byte c = buf[bufPos];
if(('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_' || c == ':') {
++bufPos;
else if(('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_' || c == ':') {
}
else {
//log("parseName not a name " + bufPos);
bufPos = start;
//log("parseName not a name " + in.getPosition());
in.reset();
return false;
}
for(;;) {
if(bufPos >= bufLen && !ensureNext(1)) {
c = in.read();
if(c < 0) {
throwException("unexpected EoF");
}
c = buf[bufPos];
if(('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || ('0' <= c && c <= '9')
else if(('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || ('0' <= c && c <= '9')
|| c == '.' || c == '-' || c == '_' || c == ':')
{
++bufPos;
}
else {
in.back();
break;
}
}
saveSelEnd(sel);
//log("parseName ok " + bufPos);
//log("parseName ok " + in.getPosition());
return true;
}
 
/**
 * Tests whether the next input character equals c and consumes it on a match.
 *
 * This listing is a revision comparison: the two-argument signature with a rollback position
 * and the buf/bufPos lines sit next to the one-argument, stream-based lines. In the
 * stream-based lines a mismatching byte is pushed back with in.back(), so only that single
 * byte is un-read; bytes matched by earlier testChar calls in the same condition stay
 * consumed unless the caller uses mark/reset. At end of input nothing is pushed back.
 *
 * @param c expected character
 * @return true if the character matched and was consumed
 */
private boolean testChar(char c, int rollback)
private boolean testChar(char c)
throws XmlException, IOException
{
if(bufPos >= bufLen && !ensureNext(1)) {
int cc = in.read();

if(cc < 0) {
//log("testChar eof " + in.getPosition() + " [" + c + "]");
return false;
}
if(buf[bufPos] != c) {
bufPos = rollback;
else if(cc != c) {
//log("testChar false " + in.getPosition() + " [" + c + "] [" + (char)cc + "]");
in.back();
return false;
}
else {
++bufPos;
//log("testChar true " + in.getPosition() + " [" + c + "] [" + (char)cc + "]");
return true;
}
}
303,16 → 381,16
private boolean parseAttribute(XmlSelection selName, XmlSelection selValue)
throws XmlException, IOException
{
//log("parseAttribute begin " + bufPos);
//log("parseAttribute begin " + in.getPosition());
// name
if(!parseName(selName)) {
//log("parseAttribute no name " + bufPos);
//log("parseAttribute no name " + in.getPosition());
return false;
}
// eq
skipSpaces();
if(!testChar('=', bufPos)) {
if(!testChar('=')) {
throwException("equal sign expected");
}
skipSpaces();
320,30 → 398,30
// FIXME allow 'Reference' here
// value
if(!testChar('"', bufPos)) {
if(!testChar('"')) {
throwException("quoted string expected");
}
saveSelBegin(selValue);
for(;; ++bufPos) {
if(bufPos >= bufLen && !ensureNext(1)) {
for(;;) {
int c = in.read();
if(c < 0) {
throwException("unexpected EoF");
}
byte c = buf[bufPos];
if(c == '<' || c == '&' || c == '"') {
else if(c == '<' || c == '&' || c == '"') {
in.back();
break;
}
}
saveSelEnd(selValue);
if(!testChar('"', bufPos)) {
if(!testChar('"')) {
throwException("end of quoted string expected");
}
// FIXME check '[WFC: No External Entity References]'
//log("parseAttribute ok " + bufPos);
//log("parseAttribute ok " + in.getPosition());
return true;
}
 
350,21 → 428,21
/**
 * Tries to consume one miscellaneous item: a comment, then a processing instruction,
 * then a run of whitespace, in that order.
 *
 * The paired log lines differ only in the position they would print (bufPos versus
 * in.getPosition()); all of them are commented out.
 *
 * @return true if one of the three alternatives matched, false otherwise
 */
private boolean parseMisc()
throws XmlException, IOException
{
//log("parseMisc begin " + bufPos);
//log("parseMisc begin " + in.getPosition());
if(parseComment()) {
//log("parseMisc comment ok " + bufPos);
//log("parseMisc comment ok " + in.getPosition());
return true;
}
else if(parseProcessInstruction()) {
//log("parseMisc pi ok " + bufPos);
//log("parseMisc pi ok " + in.getPosition());
return true;
}
else if(skipSpaces()) {
//log("parseMisc spaces ok " + bufPos);
//log("parseMisc spaces ok " + in.getPosition());
return true;
}
else {
//log("parseMisc false " + bufPos);
//log("parseMisc false " + in.getPosition());
return false;
}
}
372,26 → 450,18
private boolean parseComment()
throws XmlException, IOException
{
//log("parseComment begin " + bufPos);
if(bufPos+3 >= bufLen && !ensureNext(4)) {
//log("parseComment no data " + bufPos);
//log("parseComment begin " + in.getPosition());
in.mark();
if(!testChar('<') || !testChar('!') || !testChar('-') || !testChar('-')) {
in.reset();
//log("parseComment no signature " + in.getPosition());
return false;
}
if(buf[bufPos] != '<' || buf[bufPos+1] != '!' || buf[bufPos+2] != '-' || buf[bufPos+3] != '-') {
//log("parseComment no signature " + bufPos);
return false;
}
bufPos += 4;
for(;; ++bufPos) {
if(bufPos+2 >= bufLen && !ensureNext(3)) {
throwException("unexpected EoF");
}
if(buf[bufPos] == '-' && buf[bufPos+1] == '-') {
if(buf[bufPos+2] == '>') {
bufPos += 3;
//log("parseComment ok " + bufPos);
for(; !in.eof(); in.read()) {
if(testChar('-') && testChar('-')) {
if(testChar('>')) {
//log("parseComment ok " + in.getPosition());
return true;
}
else {
399,40 → 469,42
}
}
}
 
throwException("unexpected EoF");
return false;
}
 
/**
 * Consumes a processing instruction: the two characters "<?" followed by everything up to
 * and including "?>".
 *
 * This listing is a revision comparison; buffer-based lines (buf/bufPos/ensureNext) and
 * stream-based lines (in.mark()/testChar/in.reset()) appear side by side. In the stream-based
 * lines the opening signature is tested under a mark, and the stream is reset when it does
 * not match.
 *
 * NOTE(review): in the stream-based scan loop, when testChar('?') succeeds and testChar('>')
 * fails, the mismatching byte is pushed back and then consumed by the loop's in.read()
 * without being tested against '?' again. An input ending in "??>" therefore looks like it
 * would miss the terminator - confirm and consider re-testing the pushed-back byte.
 *
 * @return true if a processing instruction was consumed, false if the input does not start
 *         with "<?"
 * @throws XmlException with "unexpected EoF" when the input ends before "?>"
 */
private boolean parseProcessInstruction()
throws XmlException, IOException
{
if(bufPos+1 >= bufLen && !ensureNext(2)) {
//log("parseProcessInstruction begin " + in.getPosition());
in.mark();
if(!testChar('<') || !testChar('?')) {
in.reset();
//log("parseProcessInstruction no signature " + in.getPosition());
return false;
}
if(buf[bufPos] != '<' || buf[bufPos+1] != '?') {
return false;
}
bufPos += 2;
for(;; ++bufPos) {
if(bufPos+1 >= bufLen && !ensureNext(2)) {
throwException("unexpected EoF");
}

if(buf[bufPos] == '?' && buf[bufPos+1] == '>') {
bufPos += 3;
for(; !in.eof(); in.read()) {
if(testChar('?') && testChar('>')) {
//log("parseProcessInstruction ok " + in.getPosition());
return true;
}
}

throwException("unexpected EoF");
return false;
}
 
private XmlElement parseElement()
throws XmlException, IOException
{
//log("parseElement begin " + bufPos);
//log("parseElement begin " + in.getPosition());
 
XmlElement element = new XmlElement();
if(!parseStartTag(element)) {
//log("parseElement no start tag " + bufPos);
//log("parseElement no start tag " + in.getPosition());
return null;
}
448,7 → 520,7
// throwException("tag names do not match");
}
//log("parseElement ok " + bufPos);
//log("parseElement ok " + in.getPosition());
return element;
}
 
455,12 → 527,12
private boolean parseStartTag(XmlElement element)
throws XmlException, IOException
{
//log("parseStartTag begin " + bufPos);
int start = bufPos;
//log("parseStartTag begin " + in.getPosition());
in.mark();
// begin
if(!testChar('<', bufPos)) {
//log("parseStartTag no signature " + bufPos);
if(!testChar('<')) {
//log("parseStartTag no signature " + in.getPosition());
return false;
}
467,8 → 539,8
// name
XmlSelection sel = new XmlSelection();
if(!parseName(sel)) {
//log("parseStartTag no name " + bufPos);
bufPos = start;
//log("parseStartTag no name " + in.getPosition());
in.reset();
return false;
}
//System.out.print("[" + toString(sel) + "]");
484,12 → 556,12
//System.out.println();
// end
element.isEmpty = testChar('/', bufPos);
if(!testChar('>', bufPos)) {
element.isEmpty = testChar('/');
if(!testChar('>')) {
throwException("end of tag expected");
}
//log("parseStartTag ok " + bufPos);
//log("parseStartTag ok " + in.getPosition());
return true;
}
 
496,9 → 568,9
private void parseEndTag(XmlSelection sel)
throws XmlException, IOException
{
//log("parseEndTag begin " + bufPos);
//log("parseEndTag begin " + in.getPosition());
// begin
if(!testChar('<', bufPos) || !testChar('/', bufPos)) {
if(!testChar('<') || !testChar('/')) {
throwException("cannot find tag end");
}
511,23 → 583,28
skipSpaces();
// end
if(!testChar('>', bufPos)) {
if(!testChar('>')) {
throwException("end of tag expected");
}
//log("parseEndTag ok " + bufPos);
//log("parseEndTag ok " + in.getPosition());
}
 
private boolean parseTagContent(XmlElement element)
throws XmlException, IOException
{
//log("parseTagContent begin " + bufPos);
//log("parseTagContent begin " + in.getPosition());
XmlSelection sel = new XmlSelection();
for(;;) {
if(bufPos+1 >= bufLen && !ensureNext(2)) {
if(in.eof()) {
throwException("unexpected EoF");
}
if(buf[bufPos] == '<' && buf[bufPos+1] == '/') break;
in.mark();
if(testChar('<') && testChar('/')) {
in.reset();
break;
}
in.reset();
if(parseElement() != null) {
}
543,7 → 620,7
// FIXME allow 'Reference' here
}
//log("parseTagContent ok " + bufPos);
//log("parseTagContent ok " + in.getPosition());
return true;
}
 
551,17 → 628,17
private boolean parseCharData(XmlSelection sel)
throws XmlException, IOException
{
//log("parseCharData begin " + bufPos);
//log("parseCharData begin " + in.getPosition());
boolean found = false;
saveSelBegin(sel);
for(;; ++bufPos) {
if(bufPos >= bufLen && !ensureNext(1)) {
for(;;) {
int c = in.read();
if(c < 0) {
return false;
}
byte c = buf[bufPos];
if(c == '<') {
else if(c == '<') {
in.back();
break;
}
568,7 → 645,7
found = true;
}
//log("parseCharData " + found + " " + bufPos);
//log("parseCharData " + found + " " + in.getPosition());
saveSelEnd(sel);
return found;
}
576,31 → 653,27
/**
 * Consumes a CDATA section: the nine characters "<![CDATA[" followed by content up to and
 * including "]]>", recording the content range in sel.
 *
 * This listing is a revision comparison; buffer-based lines (buf/bufPos/ensureNext) and
 * stream-based lines (in.mark()/testChar/in.reset()) appear side by side. In the stream-based
 * lines the opening signature is tested under a mark, and the stream is reset when it does
 * not match.
 *
 * NOTE(review): in the stream-based scan loop a byte that fails a later testChar in the chain
 * is pushed back and then consumed by the loop's in.read() without being re-tested against
 * ']', so content ending in "]]]>" looks like it would miss the terminator - confirm.
 * NOTE(review): in the stream-based lines saveSelEnd runs after "]]>" has been consumed,
 * whereas the buffer-based lines save the end before advancing past it; the recorded end
 * presumably now includes the closing brackets - verify against XmlSelection users.
 *
 * @param sel receives the begin and end of the section content
 * @return true if a CDATA section was consumed, false if the input does not start with
 *         "<![CDATA["
 * @throws XmlException with "unexpected EoF" when the input ends before "]]>"
 */
private boolean parseCData(XmlSelection sel)
throws XmlException, IOException
{
if(bufPos+8 >= bufLen && !ensureNext(9)) {
return false;
}
if(buf[bufPos] != '<' || buf[bufPos+1] != '!'
|| buf[bufPos+2] != '[' || buf[bufPos+3] != 'C'
|| buf[bufPos+4] != 'D' || buf[bufPos+5] != 'A'
|| buf[bufPos+6] != 'T' || buf[bufPos+7] != 'A'
|| buf[bufPos+8] != '[')
in.mark();
if(!testChar('<') || !testChar('!')
|| !testChar('[') || !testChar('C')
|| !testChar('D') || !testChar('A')
|| !testChar('T') || !testChar('A')
|| !testChar('['))
{
in.reset();
return false;
}
bufPos += 9;
saveSelBegin(sel);
for(;; ++bufPos) {
if(bufPos+2 >= bufLen && !ensureNext(3)) {
throwException("unexpected EoF");
}

if(buf[bufPos] == ']' && buf[bufPos+1] == ']' && buf[bufPos+2] == '>') {
for(; !in.eof(); in.read()) {
if(testChar(']') && testChar(']') && testChar('>')) {
saveSelEnd(sel);
bufPos += 3;
return true;
}
}

throwException("unexpected EoF");
return false;
}
}