1,6 → 1,6 |
/* |
* input 2072160977 bytes; AMD Athlon 3100+, disk speed 38.0 MB/s |
* 49.18s user 10.07s system 89% cpu 1:06.32 total; RES 9384K (empty java - 8200K) |
* 137.05s user 10.79s system 89% cpu 2:45.85 total; RES 9420K (empty java - 8200K) |
*/ |
import java.io.*; |
|
77,27 → 77,133 |
} |
|
// -------------------------------------------------------------------------------------------------------------------- |
/**
 * Byte-at-a-time view of an input stream that tracks the absolute offset of
 * the last byte returned and supports one byte of push-back.
 *
 * <p>The wrapped stream is buffered internally, so {@link #mark(int)} and
 * {@link #reset()} are available. A mark captures the complete reader state
 * (position, pending push-back, last byte, end-of-file flag), so the reader
 * behaves after {@code reset()} exactly as it did when {@code mark()} was
 * called. Only one mark is kept: a later {@code mark()} replaces the earlier
 * one.
 *
 * <p>Bulk reads are intentionally rejected; use {@link #read()}.
 */
class XmlInputStream
    extends InputStream
{
    /** Size of the internal buffer and default read-ahead limit of {@link #mark()}. */
    private static final int BUFFER_SIZE = 1024;

    private final InputStream origin;

    /** Offset of the last byte returned by {@link #read()}; -1 before the first read. */
    private long pos = -1;
    /** Result of the last read from the underlying stream (-1 for end of file). */
    private int lastRead = -1;
    /** True when {@link #back()} was called and the last byte is to be returned again. */
    private boolean useLast = false;
    /** True once the underlying stream has reported end of file at the current position. */
    private boolean atEof = false;

    // state captured by mark() and restored by reset()
    private long markedPos = -1;
    private int markedLastRead = -1;
    private boolean markedUseLast = false;
    private boolean markedAtEof = false;

    /**
     * @param origin stream to read from; it is wrapped in a buffered stream
     */
    public XmlInputStream(InputStream origin)
    {
        this.origin = new BufferedInputStream(origin, BUFFER_SIZE);
    }

    /**
     * @return offset of the byte most recently returned by {@link #read()},
     *         or -1 if nothing has been read yet
     */
    public long getPosition()
    {
        return pos;
    }

    /**
     * @return offset of the byte the next {@link #read()} will return
     */
    public long getNextPosition()
    {
        return (useLast ? pos : pos + 1);
    }

    /**
     * @return true once a read has hit the end of the stream; false before the
     *         first read, after skipping bytes, and after a {@link #reset()} to
     *         a point where the end had not been seen
     */
    public boolean eof()
    {
        return atEof;
    }

    /**
     * Pushes the most recently read byte back, so that the next {@link #read()}
     * returns it again. Only one byte of push-back is supported, and it is only
     * meaningful directly after a {@link #read()} (not after {@link #skip(long)}).
     */
    public void back()
    {
        useLast = true;
    }

    /**
     * Returns the pushed-back byte if there is one, otherwise the next byte of
     * the underlying stream.
     *
     * @return the byte as 0..255, or -1 at end of file
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int read()
        throws IOException
    {
        if(useLast) {
            useLast = false;
            return lastRead;
        }

        int next = origin.read();
        if(next >= 0) {
            ++pos;
            atEof = false;
        }
        else if(!atEof) {
            // the first end-of-file result occupies one position (the stream
            // length); repeated reads at end of file must not move further
            ++pos;
            atEof = true;
        }
        lastRead = next;

        return next;
    }

    /**
     * Bulk reads are not supported by this class.
     *
     * @throws IOException always
     */
    @Override
    public int read(byte[] b, int off, int len)
        throws IOException
    {
        throw new IOException("not supported");
    }

    /**
     * Skips up to {@code n} bytes. A pending pushed-back byte counts as the
     * first skipped byte.
     *
     * @param n number of bytes to skip
     * @return number of bytes actually skipped
     * @throws IOException if the underlying stream fails
     */
    @Override
    public long skip(long n)
        throws IOException
    {
        if(n <= 0) {
            return 0;
        }

        long skipped = 0;
        if(useLast) {
            if(lastRead < 0) {
                // end of file was pushed back - there is nothing to skip
                return 0;
            }
            // pos already addresses the pushed-back byte, so it does not move
            useLast = false;
            skipped = 1;
        }

        if(skipped < n) {
            long res = origin.skip(n - skipped);
            if(res > 0) {
                pos += res;
                atEof = false;
                skipped += res;
            }
        }

        return skipped;
    }

    /**
     * @return estimate of the bytes readable without blocking, including a
     *         pending pushed-back byte
     * @throws IOException if the underlying stream fails
     */
    @Override
    public int available()
        throws IOException
    {
        int avail = origin.available();
        if(useLast && lastRead >= 0 && avail < Integer.MAX_VALUE) {
            ++avail;
        }
        return avail;
    }

    @Override
    public void close()
        throws IOException
    {
        origin.close();
    }

    @Override
    public boolean markSupported()
    {
        return origin.markSupported();
    }

    /**
     * Marks the current position with a read-ahead limit equal to the internal
     * buffer size.
     */
    public void mark()
    {
        mark(BUFFER_SIZE);
    }

    /**
     * Remembers the current reader state so that {@link #reset()} can return
     * to it.
     *
     * @param readlimit number of bytes that may be read before the mark becomes invalid
     */
    @Override
    public void mark(int readlimit)
    {
        markedPos = pos;
        markedLastRead = lastRead;
        markedUseLast = useLast;
        markedAtEof = atEof;
        origin.mark(readlimit);
    }

    /**
     * Returns to the state captured by the last {@code mark} call.
     *
     * @throws IOException if no valid mark exists; the reader state is left untouched then
     */
    @Override
    public void reset()
        throws IOException
    {
        origin.reset();
        pos = markedPos;
        lastRead = markedLastRead;
        useLast = markedUseLast;
        atEof = markedAtEof;
    }
}
|
// -------------------------------------------------------------------------------------------------------------------- |
class XmlDocument |
{ |
private InputStream in; |
private byte[] buf = new byte[2048]; // (buf.length % 16 == 0) |
private int bufLen; |
private int bufPos; |
private long bufOffset; |
private long line; |
private long linePos; |
private XmlInputStream in; |
private long line; |
private long linePos; |
|
public void parse(InputStream in) |
throws XmlException, IOException |
{ |
if(buf.length % 16 != 0) { |
throwException("wrong buffer size: " + buf.length); |
} |
|
this.in = in; |
this.bufLen = 0; |
this.bufPos = 0; |
this.bufOffset = 0; |
this.in = new XmlInputStream(in); |
this.line = 1; |
this.linePos = 0; |
|
114,7 → 220,7 |
private void throwException(String message) |
throws XmlException, IOException |
{ |
throw new XmlException(message, line, /*linePos*/ bufPos); |
throw new XmlException(message, line, /*linePos*/ in.getPosition()); |
} |
|
private void log(String message) |
125,49 → 231,20 |
private String toString(XmlSelection sel) |
throws IOException |
{ |
return new String(buf, (int)(sel.begin - bufOffset), (int)sel.getLength(), "UTF-8"); |
//return new String(buf, (int)(sel.begin - bufOffset), (int)sel.getLength(), "UTF-8"); |
return (sel.begin + "," + (int)sel.getLength()); |
} |
|
private void saveSelBegin(XmlSelection sel) |
{ |
sel.begin = bufOffset + bufPos; |
sel.begin = in.getNextPosition(); |
} |
|
private void saveSelEnd(XmlSelection sel) |
{ |
sel.end = bufOffset + bufPos; |
sel.end = in.getPosition(); |
} |
|
// Buffer-based look-ahead helper: reports whether bufPos + count < bufLen,
// refilling buf from the input first when that does not hold yet.
//
// Refill strategy, as implemented below:
//  - bufLen == 0: nothing has been read so far, so the whole buffer is read;
//  - bufLen < buf.length: the previous fill was short, which is taken to mean
//    that the input is exhausted, so false is returned without reading;
//  - otherwise the last 1/16 of the buffer is moved to the front, the remaining
//    15/16 are read anew, and bufPos / bufOffset are shifted by the same amount
//    so that bufOffset + bufPos keeps addressing the same input byte.
//
// NOTE(review): the result of in.read(...) is used unchecked; a return value
// of -1 (end of input) or a short read that is not end of input would make
// bufLen wrong - confirm against the streams this is used with.
// NOTE(review): this relies on bulk reads, which XmlInputStream rejects with
// "not supported"; it presumably belongs to the buffer-based variant only.
private boolean ensureNext(int count) |
throws XmlException, IOException |
{ |
if(bufPos + count >= bufLen) { |
//log("ensureNext start " + bufPos + " " + count); |
if(bufLen == 0) { |
// read full buffer at begin |
bufLen = in.read(buf); |
bufPos = 0; |
} |
else if(bufLen < buf.length) { |
// we could not fill full buffer last time - no more data |
return false; |
} |
else { |
// move last 1/16 of data to begin, fill rest with new data |
System.arraycopy(buf, buf.length / 16 * 15, buf, 0, buf.length / 16); |
int read = in.read(buf, buf.length / 16, buf.length / 16 * 15); |
bufLen = buf.length / 16 + read; |
bufPos -= buf.length / 16 * 15; |
bufOffset += buf.length / 16 * 15; |
} |
|
// re-check after the refill
return (bufPos + count < bufLen); |
} |
else { |
return true; |
} |
} |
|
private boolean parseProlog() |
throws XmlException, IOException |
{ |
183,23 → 260,24 |
private boolean skipSpaces() |
throws XmlException, IOException |
{ |
//log("skipSpaces begin " + bufPos); |
//log("skipSpaces begin " + in.getPosition()); |
boolean found = false; |
|
for(;;) { |
if(bufPos >= bufLen && !ensureNext(1)) break; |
|
byte c = buf[bufPos]; |
if(c == ' ' || c == '\t' || c == '\n' || c == '\r') { |
int c = in.read(); |
if(c < 0) { |
break; |
} |
else if(c == ' ' || c == '\t' || c == '\n' || c == '\r') { |
found = true; |
++bufPos; |
} |
else { |
in.back(); |
break; |
} |
} |
|
//log("skipSpaces " + found + " " + bufPos); |
//log("skipSpaces " + found + " " + in.getPosition()); |
return found; |
} |
|
206,13 → 284,13 |
private boolean parseDecl() |
throws XmlException, IOException |
{ |
//log("parseDecl begin " + bufPos); |
//log("parseDecl begin " + in.getPosition()); |
|
// begin |
if(!testChar('<', bufPos) || !testChar('?', bufPos) || !testChar('x', bufPos) |
|| !testChar('m', bufPos) || !testChar('l', bufPos)) |
if(!testChar('<') || !testChar('?') || !testChar('x') |
|| !testChar('m') || !testChar('l')) |
{ |
//log("parseDecl no 'xml' " + bufPos); |
//log("parseDecl no 'xml' " + in.getPosition()); |
return false; |
} |
|
225,11 → 303,11 |
} |
|
// end |
if(!testChar('?', bufPos) || !testChar('>', bufPos)) { |
if(!testChar('?') || !testChar('>')) { |
throwException("end of XML declaration expected"); |
} |
|
//log("parseDecl ok " + bufPos); |
//log("parseDecl ok " + in.getPosition()); |
return true; |
} |
|
244,58 → 322,58 |
private boolean parseName(XmlSelection sel) |
throws XmlException, IOException |
{ |
//log("parseName begin " + bufPos); |
//log("parseName begin " + in.getPosition()); |
saveSelBegin(sel); |
int start = bufPos; |
in.mark(); |
|
if(bufPos >= bufLen && !ensureNext(1)) { |
int c = in.read(); |
if(c < 0) { |
return false; |
} |
|
byte c = buf[bufPos]; |
if(('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_' || c == ':') { |
++bufPos; |
else if(('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_' || c == ':') { |
} |
else { |
//log("parseName not a name " + bufPos); |
bufPos = start; |
//log("parseName not a name " + in.getPosition()); |
in.reset(); |
return false; |
} |
|
for(;;) { |
if(bufPos >= bufLen && !ensureNext(1)) { |
c = in.read(); |
if(c < 0) { |
throwException("unexpected EoF"); |
} |
|
c = buf[bufPos]; |
if(('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || ('0' <= c && c <= '9') |
else if(('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || ('0' <= c && c <= '9') |
|| c == '.' || c == '-' || c == '_' || c == ':') |
{ |
++bufPos; |
} |
else { |
in.back(); |
break; |
} |
} |
saveSelEnd(sel); |
|
//log("parseName ok " + bufPos); |
//log("parseName ok " + in.getPosition()); |
return true; |
} |
|
private boolean testChar(char c, int rollback) |
private boolean testChar(char c) |
throws XmlException, IOException |
{ |
if(bufPos >= bufLen && !ensureNext(1)) { |
int cc = in.read(); |
|
if(cc < 0) { |
//log("testChar eof " + in.getPosition() + " [" + c + "]"); |
return false; |
} |
|
if(buf[bufPos] != c) { |
bufPos = rollback; |
else if(cc != c) { |
//log("testChar false " + in.getPosition() + " [" + c + "] [" + (char)cc + "]"); |
in.back(); |
return false; |
} |
else { |
++bufPos; |
//log("testChar true " + in.getPosition() + " [" + c + "] [" + (char)cc + "]"); |
return true; |
} |
} |
303,16 → 381,16 |
private boolean parseAttribute(XmlSelection selName, XmlSelection selValue) |
throws XmlException, IOException |
{ |
//log("parseAttribute begin " + bufPos); |
//log("parseAttribute begin " + in.getPosition()); |
// name |
if(!parseName(selName)) { |
//log("parseAttribute no name " + bufPos); |
//log("parseAttribute no name " + in.getPosition()); |
return false; |
} |
|
// eq |
skipSpaces(); |
if(!testChar('=', bufPos)) { |
if(!testChar('=')) { |
throwException("equal sign expected"); |
} |
skipSpaces(); |
320,30 → 398,30 |
// FIXME allow 'Reference' here |
|
// value |
if(!testChar('"', bufPos)) { |
if(!testChar('"')) { |
throwException("quoted string expected"); |
} |
|
saveSelBegin(selValue); |
for(;; ++bufPos) { |
if(bufPos >= bufLen && !ensureNext(1)) { |
for(;;) { |
int c = in.read(); |
if(c < 0) { |
throwException("unexpected EoF"); |
} |
|
byte c = buf[bufPos]; |
if(c == '<' || c == '&' || c == '"') { |
else if(c == '<' || c == '&' || c == '"') { |
in.back(); |
break; |
} |
} |
saveSelEnd(selValue); |
|
if(!testChar('"', bufPos)) { |
if(!testChar('"')) { |
throwException("end of quoted string expected"); |
} |
|
// FIXME check '[WFC: No External Entity References]' |
|
//log("parseAttribute ok " + bufPos); |
//log("parseAttribute ok " + in.getPosition()); |
return true; |
} |
|
350,21 → 428,21 |
private boolean parseMisc() |
throws XmlException, IOException |
{ |
//log("parseMisc begin " + bufPos); |
//log("parseMisc begin " + in.getPosition()); |
if(parseComment()) { |
//log("parseMisc comment ok " + bufPos); |
//log("parseMisc comment ok " + in.getPosition()); |
return true; |
} |
else if(parseProcessInstruction()) { |
//log("parseMisc pi ok " + bufPos); |
//log("parseMisc pi ok " + in.getPosition()); |
return true; |
} |
else if(skipSpaces()) { |
//log("parseMisc spaces ok " + bufPos); |
//log("parseMisc spaces ok " + in.getPosition()); |
return true; |
} |
else { |
//log("parseMisc false " + bufPos); |
//log("parseMisc false " + in.getPosition()); |
return false; |
} |
} |
372,26 → 450,18 |
private boolean parseComment() |
throws XmlException, IOException |
{ |
//log("parseComment begin " + bufPos); |
if(bufPos+3 >= bufLen && !ensureNext(4)) { |
//log("parseComment no data " + bufPos); |
//log("parseComment begin " + in.getPosition()); |
in.mark(); |
if(!testChar('<') || !testChar('!') || !testChar('-') || !testChar('-')) { |
in.reset(); |
//log("parseComment no signature " + in.getPosition()); |
return false; |
} |
if(buf[bufPos] != '<' || buf[bufPos+1] != '!' || buf[bufPos+2] != '-' || buf[bufPos+3] != '-') { |
//log("parseComment no signature " + bufPos); |
return false; |
} |
|
bufPos += 4; |
for(;; ++bufPos) { |
if(bufPos+2 >= bufLen && !ensureNext(3)) { |
throwException("unexpected EoF"); |
} |
|
if(buf[bufPos] == '-' && buf[bufPos+1] == '-') { |
if(buf[bufPos+2] == '>') { |
bufPos += 3; |
//log("parseComment ok " + bufPos); |
for(; !in.eof(); in.read()) { |
if(testChar('-') && testChar('-')) { |
if(testChar('>')) { |
//log("parseComment ok " + in.getPosition()); |
return true; |
} |
else { |
399,40 → 469,42 |
} |
} |
} |
|
throwException("unexpected EoF"); |
return false; |
} |
|
private boolean parseProcessInstruction() |
throws XmlException, IOException |
{ |
if(bufPos+1 >= bufLen && !ensureNext(2)) { |
//log("parseProcessInstruction begin " + in.getPosition()); |
in.mark(); |
if(!testChar('<') || !testChar('?')) { |
in.reset(); |
//log("parseProcessInstruction no signature " + in.getPosition()); |
return false; |
} |
if(buf[bufPos] != '<' || buf[bufPos+1] != '?') { |
return false; |
} |
|
bufPos += 2; |
for(;; ++bufPos) { |
if(bufPos+1 >= bufLen && !ensureNext(2)) { |
throwException("unexpected EoF"); |
} |
|
if(buf[bufPos] == '?' && buf[bufPos+1] == '>') { |
bufPos += 3; |
for(; !in.eof(); in.read()) { |
if(testChar('?') && testChar('>')) { |
//log("parseProcessInstruction ok " + in.getPosition()); |
return true; |
} |
} |
|
throwException("unexpected EoF"); |
return false; |
} |
|
private XmlElement parseElement() |
throws XmlException, IOException |
{ |
//log("parseElement begin " + bufPos); |
//log("parseElement begin " + in.getPosition()); |
|
XmlElement element = new XmlElement(); |
|
if(!parseStartTag(element)) { |
//log("parseElement no start tag " + bufPos); |
//log("parseElement no start tag " + in.getPosition()); |
return null; |
} |
|
448,7 → 520,7 |
// throwException("tag names do not match"); |
} |
|
//log("parseElement ok " + bufPos); |
//log("parseElement ok " + in.getPosition()); |
return element; |
} |
|
455,12 → 527,12 |
private boolean parseStartTag(XmlElement element) |
throws XmlException, IOException |
{ |
//log("parseStartTag begin " + bufPos); |
int start = bufPos; |
//log("parseStartTag begin " + in.getPosition()); |
in.mark(); |
|
// begin |
if(!testChar('<', bufPos)) { |
//log("parseStartTag no signature " + bufPos); |
if(!testChar('<')) { |
//log("parseStartTag no signature " + in.getPosition()); |
return false; |
} |
|
467,8 → 539,8 |
// name |
XmlSelection sel = new XmlSelection(); |
if(!parseName(sel)) { |
//log("parseStartTag no name " + bufPos); |
bufPos = start; |
//log("parseStartTag no name " + in.getPosition()); |
in.reset(); |
return false; |
} |
//System.out.print("[" + toString(sel) + "]"); |
484,12 → 556,12 |
//System.out.println(); |
|
// end |
element.isEmpty = testChar('/', bufPos); |
if(!testChar('>', bufPos)) { |
element.isEmpty = testChar('/'); |
if(!testChar('>')) { |
throwException("end of tag expected"); |
} |
|
//log("parseStartTag ok " + bufPos); |
//log("parseStartTag ok " + in.getPosition()); |
return true; |
} |
|
496,9 → 568,9 |
private void parseEndTag(XmlSelection sel) |
throws XmlException, IOException |
{ |
//log("parseEndTag begin " + bufPos); |
//log("parseEndTag begin " + in.getPosition()); |
// begin |
if(!testChar('<', bufPos) || !testChar('/', bufPos)) { |
if(!testChar('<') || !testChar('/')) { |
throwException("cannot find tag end"); |
} |
|
511,23 → 583,28 |
skipSpaces(); |
|
// end |
if(!testChar('>', bufPos)) { |
if(!testChar('>')) { |
throwException("end of tag expected"); |
} |
|
//log("parseEndTag ok " + bufPos); |
//log("parseEndTag ok " + in.getPosition()); |
} |
|
private boolean parseTagContent(XmlElement element) |
throws XmlException, IOException |
{ |
//log("parseTagContent begin " + bufPos); |
//log("parseTagContent begin " + in.getPosition()); |
XmlSelection sel = new XmlSelection(); |
for(;;) { |
if(bufPos+1 >= bufLen && !ensureNext(2)) { |
if(in.eof()) { |
throwException("unexpected EoF"); |
} |
if(buf[bufPos] == '<' && buf[bufPos+1] == '/') break; |
in.mark(); |
if(testChar('<') && testChar('/')) { |
in.reset(); |
break; |
} |
in.reset(); |
|
if(parseElement() != null) { |
} |
543,7 → 620,7 |
// FIXME allow 'Reference' here |
} |
|
//log("parseTagContent ok " + bufPos); |
//log("parseTagContent ok " + in.getPosition()); |
return true; |
} |
|
551,17 → 628,17 |
private boolean parseCharData(XmlSelection sel) |
throws XmlException, IOException |
{ |
//log("parseCharData begin " + bufPos); |
//log("parseCharData begin " + in.getPosition()); |
boolean found = false; |
saveSelBegin(sel); |
|
for(;; ++bufPos) { |
if(bufPos >= bufLen && !ensureNext(1)) { |
for(;;) { |
int c = in.read(); |
if(c < 0) { |
return false; |
} |
|
byte c = buf[bufPos]; |
if(c == '<') { |
else if(c == '<') { |
in.back(); |
break; |
} |
|
568,7 → 645,7 |
found = true; |
} |
|
//log("parseCharData " + found + " " + bufPos); |
//log("parseCharData " + found + " " + in.getPosition()); |
saveSelEnd(sel); |
return found; |
} |
576,31 → 653,27 |
private boolean parseCData(XmlSelection sel) |
throws XmlException, IOException |
{ |
if(bufPos+8 >= bufLen && !ensureNext(9)) { |
return false; |
} |
if(buf[bufPos] != '<' || buf[bufPos+1] != '!' |
|| buf[bufPos+2] != '[' || buf[bufPos+3] != 'C' |
|| buf[bufPos+4] != 'D' || buf[bufPos+5] != 'A' |
|| buf[bufPos+6] != 'T' || buf[bufPos+7] != 'A' |
|| buf[bufPos+8] != '[') |
in.mark(); |
if(!testChar('<') || !testChar('!') |
|| !testChar('[') || !testChar('C') |
|| !testChar('D') || !testChar('A') |
|| !testChar('T') || !testChar('A') |
|| !testChar('[')) |
{ |
in.reset(); |
return false; |
} |
|
bufPos += 9; |
saveSelBegin(sel); |
|
for(;; ++bufPos) { |
if(bufPos+2 >= bufLen && !ensureNext(3)) { |
throwException("unexpected EoF"); |
} |
|
if(buf[bufPos] == ']' && buf[bufPos+1] == ']' && buf[bufPos+2] == '>') { |
for(; !in.eof(); in.read()) { |
if(testChar(']') && testChar(']') && testChar('>')) { |
saveSelEnd(sel); |
bufPos += 3; |
return true; |
} |
} |
|
throwException("unexpected EoF"); |
return false; |
} |
} |