/*
 * Benchmark: input 2072160977 bytes; AMD Athlon 3100+, disk speed 38.0 MB/s
 * 59.18s user 9.77s system 92% cpu 1:14.72 total; RES 9420K (empty java - 8200K)
 */
import java.io.*; |
|
public class XmlParser |
{ |
public static void main(String[] args) |
throws Exception |
{ |
if(args.length < 1) { |
System.err.println("Need file name as parameter"); |
return; |
} |
|
long startTime = System.nanoTime(); |
|
FileInputStream in = new FileInputStream(args[0]); |
|
try { |
XmlListenerImpl listener = new XmlListenerImpl(); |
XmlDocument doc = new XmlDocument(listener); |
|
doc.parse(in); |
System.out.println(listener.count + " elements found"); |
} |
catch(XmlException ex) { |
System.out.println(ex); |
} |
finally { |
in.close(); |
} |
|
System.out.println("Elapsed: " + (System.nanoTime() - startTime) / 1000000 + "ms"); |
} |
} |
|
// -------------------------------------------------------------------------------------------------------------------- |
class XmlListenerImpl |
implements XmlListener |
{ |
public long count = 0; |
|
private XmlDocument document; |
|
public void init(XmlDocument document) |
throws XmlException, IOException |
{ |
this.document = document; |
} |
|
public void processElementBegin(XmlElement element) |
throws XmlException, IOException |
{ |
++count; |
} |
|
public void processElementEnd(XmlElement element) |
throws XmlException, IOException |
{ |
} |
|
public void processCharData(XmlSelection sel) |
throws XmlException, IOException |
{ |
} |
|
public void processCData(XmlSelection sel) |
throws XmlException, IOException |
{ |
} |
} |
|
// -------------------------------------------------------------------------------------------------------------------- |
/**
 * Checked exception describing a parse error together with the position
 * (line and position within the line) supplied by whoever raised it.
 */
class XmlException extends Exception {
    static final long serialVersionUID = 1;

    private final long line;
    private final long linePos;

    /**
     * @param message description of the problem
     * @param line    line value to report
     * @param linePos position value to report
     */
    public XmlException(String message, long line, long linePos) {
        super(message);
        this.line = line;
        this.linePos = linePos;
    }

    /** @return the line value given at construction */
    public long getLine() {
        return line;
    }

    /** @return the position value given at construction */
    public long getLinePos() {
        return linePos;
    }

    /** @return text of the form {@code Error: <message> at <line>:<linePos>} */
    public String toString() {
        StringBuilder text = new StringBuilder("Error: ");
        text.append(getMessage());
        text.append(" at ").append(line);
        text.append(":").append(linePos);
        return text.toString();
    }
}
|
// -------------------------------------------------------------------------------------------------------------------- |
/**
 * Half-open range {@code [begin, end)} of positions; a plain mutable holder
 * whose fields are filled in by the code that uses it.
 */
class XmlSelection {
    /** Position of the first unit in the range. */
    public long begin;

    /** Position just past the last unit in the range. */
    public long end;

    /** @return {@code end - begin} */
    public long getLength() {
        final long span = end - begin;
        return span;
    }
}
|
// -------------------------------------------------------------------------------------------------------------------- |
interface XmlListener |
{ |
public void init(XmlDocument document) |
throws XmlException, IOException; |
|
public void processElementBegin(XmlElement element) |
throws XmlException, IOException; |
|
public void processElementEnd(XmlElement element) |
throws XmlException, IOException; |
|
public void processCharData(XmlSelection sel) |
throws XmlException, IOException; |
|
public void processCData(XmlSelection sel) |
throws XmlException, IOException; |
} |
|
// -------------------------------------------------------------------------------------------------------------------- |
class XmlElement |
{ |
public boolean isEmpty = false; |
public XmlSelection elementSel; |
public XmlSelection nameSel; |
} |
|
// -------------------------------------------------------------------------------------------------------------------- |
/**
 * Sliding-window byte reader over an {@link InputStream}.
 *
 * <p>It exposes the current byte, a bounded look-ahead ({@link #at}), the
 * absolute stream position and a single mark/reset slot. When the window is
 * full and more input is needed, the oldest bytes are dropped (normally the
 * first 15/16 of the window), the rest is moved to the front and the window
 * is refilled from the stream.
 *
 * <p>A byte value of 0 returned by {@link #cur} or {@link #at} means either a
 * real zero byte or "no such byte"; use {@link #isEnd} to tell the two apart.
 */
class XmlBuffer
{
    private InputStream in;
    private byte[] buf = new byte[2048];  // (buf.length % 16 == 0)
    private int len;                      // number of valid bytes in buf
    private int pos;                      // index of the current byte in buf
    private long offset;                  // stream position of buf[0]
    private int marked = -1;              // saved pos, or -1 when none is saved
    private boolean eof;                  // set once the stream reported end of data

    /**
     * @param in stream to read from; it is not closed by this class
     * @throws RuntimeException if the built-in buffer size is not a multiple of 16
     */
    public XmlBuffer(InputStream in)
    {
        if(buf.length % 16 != 0) {
            throw new RuntimeException("wrong buffer size: " + buf.length);
        }

        this.in = in;
        this.len = 0;
        this.pos = 0;
        this.offset = 0;
        this.eof = false;
    }

    /**
     * @return the byte at the current position, or 0 if the input is exhausted
     */
    public byte cur()
        throws IOException
    {
        if(pos >= len && !ensureNext(1)) {
            return 0;
        }

        return buf[pos];
    }

    /**
     * @param n distance from the current position (0 is the current byte)
     * @return the byte {@code n} positions ahead, or 0 if the input ends before it
     */
    public byte at(int n)
        throws IOException
    {
        if(pos + n >= len && !ensureNext(n + 1)) {
            return 0;
        }

        // ensureNext may have slid the window, so pos is re-read here
        return buf[pos + n];
    }

    /** Advances the current position by one byte (no bounds check). */
    public void toNext()
    {
        ++pos;
    }

    /** @return true when there is no byte at the current position */
    public boolean isEnd()
        throws IOException
    {
        return (pos >= len && !ensureNext(1));
    }

    /** Advances the current position by {@code n} bytes (no bounds check). */
    public void skip(int n)
    {
        pos += n;
    }

    /** @return absolute position of the current byte, counted from the start of the stream */
    public long getPosition()
    {
        return (offset + pos);
    }

    /** Saves the current position for a later {@link #reset}. */
    public void mark()
    {
        marked = pos;
    }

    /**
     * Returns to the position saved by {@link #mark} and clears the saved position.
     *
     * @throws RuntimeException if nothing is saved, or the saved position has
     *                          been dropped from the window since
     */
    public void reset()
    {
        if(marked < 0) {
            throw new RuntimeException("no position saved");
        }
        else {
            pos = marked;
            marked = -1;
        }
    }

    /**
     * Makes sure {@code count} bytes starting at the current position are in
     * the window, reading and sliding as needed.
     *
     * @param count number of bytes required, including the current one
     * @return true if the bytes are available, false if the stream ends first
     * @throws IllegalArgumentException if {@code count} exceeds the window size
     */
    public boolean ensureNext(int count)
        throws IOException
    {
        if(count > buf.length) {
            throw new IllegalArgumentException(
                "look-ahead of " + count + " bytes exceeds buffer size " + buf.length);
        }

        while(pos + count > len) {
            if(eof) {
                return false;
            }

            if(len == buf.length) {
                // Window is full: drop consumed bytes, at most 15/16 of the
                // window and never anything at or after the current position.
                int shift = Math.min(pos, buf.length / 16 * 15);
                System.arraycopy(buf, shift, buf, 0, len - shift);
                len -= shift;
                pos -= shift;
                offset += shift;
                if(marked >= 0) {
                    marked -= shift;
                    if(marked < 0) marked = -1;  // saved position fell out of the window
                }
            }

            fill();
        }

        return true;
    }

    /**
     * Reads until the window is full or the stream ends. A single read may
     * deliver fewer bytes than requested, so only a negative result is taken
     * as end of stream.
     */
    private void fill()
        throws IOException
    {
        while(len < buf.length) {
            int read = in.read(buf, len, buf.length - len);
            if(read < 0) {
                eof = true;
                return;
            }
            len += read;
        }
    }

    /**
     * Decodes bytes that are still inside the window as UTF-8.
     *
     * @param begin  absolute stream position of the first byte
     * @param length number of bytes
     * @return the decoded text
     * @throws StringIndexOutOfBoundsException if the range is not fully inside
     *                                         the currently buffered data
     */
    public String toString(long begin, int length)
        throws IOException
    {
        long start = begin - offset;
        if(start < 0 || length < 0 || start + length > len) {
            throw new StringIndexOutOfBoundsException(
                "range " + begin + "+" + length + " is not in the buffer");
        }

        return new String(buf, (int)start, length, "UTF-8");
    }
}
|
// -------------------------------------------------------------------------------------------------------------------- |
class XmlDocument |
{ |
private XmlListener listener; |
private XmlBuffer buf; |
private int level; |
|
public XmlDocument(XmlListener listener) |
{ |
this.listener = listener; |
} |
|
public void parse(InputStream in) |
throws XmlException, IOException |
{ |
this.buf = new XmlBuffer(in); |
this.level = 0; |
|
if(listener != null) listener.init(this); |
|
if(parseProlog()) { |
parseElement(); |
|
while(parseMisc()); |
|
if(!buf.isEnd()) throwException("EoF expected"); |
} |
} |
|
private void throwException(String message) |
throws XmlException, IOException |
{ |
throw new XmlException(message, 1, buf.getPosition()); |
} |
|
private void log(String message) |
{ |
System.out.println(message); |
} |
|
public String selectionToString(XmlSelection sel) |
throws IOException |
{ |
return buf.toString(sel.begin, (int)sel.getLength()); |
} |
|
private void saveSelBegin(XmlSelection sel) |
{ |
sel.begin = buf.getPosition(); |
} |
|
private void saveSelEnd(XmlSelection sel) |
{ |
sel.end = buf.getPosition(); |
} |
|
private boolean parseProlog() |
throws XmlException, IOException |
{ |
parseDecl(); |
while(parseMisc()); |
if(parseDoctype()) { |
while(parseMisc()); |
} |
|
return true; |
} |
|
private boolean skipSpaces() |
throws XmlException, IOException |
{ |
boolean found = false; |
|
for(;;) { |
byte c = buf.cur(); |
if(c == 0) { |
break; |
} |
else if(c == ' ' || c == '\t' || c == '\n' || c == '\r') { |
found = true; |
buf.toNext(); |
} |
else { |
break; |
} |
} |
|
return found; |
} |
|
private boolean parseDecl() |
throws XmlException, IOException |
{ |
// begin |
if(!testChar('<') || !testChar('?') || !testChar('x') |
|| !testChar('m') || !testChar('l')) |
{ |
return false; |
} |
|
// attributes |
XmlSelection selName = new XmlSelection(); |
XmlSelection selValue = new XmlSelection(); |
for(;;) { |
if(!skipSpaces()) break; |
if(!parseAttribute(selName, selValue)) break; |
} |
|
// end |
if(!testChar('?') || !testChar('>')) { |
throwException("end of XML declaration expected"); |
} |
|
return true; |
} |
|
private boolean parseDoctype() |
throws XmlException, IOException |
{ |
// FIXME not implemented |
|
return true; |
} |
|
private boolean parseName(XmlSelection sel) |
throws XmlException, IOException |
{ |
saveSelBegin(sel); |
|
byte c = buf.cur(); |
if(('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || c == '_' || c == ':') { |
buf.toNext(); |
} |
else { |
return false; |
} |
|
for(;;) { |
if(buf.isEnd()) { |
throwException("unexpected EoF"); |
} |
|
c = buf.cur(); |
if(('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || ('0' <= c && c <= '9') |
|| c == '.' || c == '-' || c == '_' || c == ':') |
{ |
buf.toNext(); |
} |
else { |
break; |
} |
} |
saveSelEnd(sel); |
|
return true; |
} |
|
private boolean testChar(char c) |
throws XmlException, IOException |
{ |
if(buf.cur() != c) { |
return false; |
} |
else { |
buf.toNext(); |
return true; |
} |
} |
|
private boolean parseAttribute(XmlSelection selName, XmlSelection selValue) |
throws XmlException, IOException |
{ |
// name |
buf.mark(); |
if(!parseName(selName)) { |
buf.reset(); |
return false; |
} |
|
// eq |
skipSpaces(); |
if(!testChar('=')) { |
throwException("equal sign expected"); |
} |
skipSpaces(); |
|
// FIXME allow 'Reference' here |
|
// value |
if(!testChar('"')) { |
throwException("quoted string expected"); |
} |
|
saveSelBegin(selValue); |
for(;; buf.toNext()) { |
if(buf.isEnd()) { |
throwException("unexpected EoF"); |
} |
|
byte c = buf.cur(); |
if(c == '<' || c == '&' || c == '"') { |
break; |
} |
} |
saveSelEnd(selValue); |
|
if(!testChar('"')) { |
throwException("end of quoted string expected"); |
} |
|
// FIXME check '[WFC: No External Entity References]' |
|
return true; |
} |
|
private boolean parseMisc() |
throws XmlException, IOException |
{ |
if(parseComment()) { |
return true; |
} |
else if(parseProcessInstruction()) { |
return true; |
} |
else if(skipSpaces()) { |
return true; |
} |
else { |
return false; |
} |
} |
|
private boolean parseComment() |
throws XmlException, IOException |
{ |
if(buf.at(0) != '<' || buf.at(1) != '!' || buf.at(2) != '-' || buf.at(3) != '-') { |
return false; |
} |
|
buf.skip(4); |
for(;; buf.toNext()) { |
if(buf.isEnd()) { |
throwException("unexpected EoF"); |
} |
|
if(buf.at(0) == '-' && buf.at(1) == '-') { |
if(buf.at(2) == '>') { |
buf.skip(3); |
|
return true; |
} |
else { |
throwException("Sequence '--' is not allowed in comment"); |
} |
} |
} |
} |
|
private boolean parseProcessInstruction() |
throws XmlException, IOException |
{ |
if(buf.at(0) != '<' || buf.at(1) != '?') { |
return false; |
} |
|
buf.skip(2); |
for(;; buf.toNext()) { |
if(buf.isEnd()) { |
throwException("unexpected EoF"); |
} |
|
if(buf.at(0) == '?' && buf.at(1) == '>') { |
buf.skip(2); |
|
return true; |
} |
} |
} |
|
private XmlElement parseElement() |
throws XmlException, IOException |
{ |
XmlElement element = new XmlElement(); |
element.elementSel = new XmlSelection(); |
saveSelBegin(element.elementSel); |
|
if(!parseStartTag(element)) { |
return null; |
} |
|
++level; |
|
if(listener != null) listener.processElementBegin(element); |
|
if(!element.isEmpty) { |
if(!parseTagContent(element)) { |
throwException("cannot parse tag content"); |
} |
|
XmlSelection selEndName = new XmlSelection(); |
parseEndTag(selEndName); |
|
//if(!selectionToString(element.nameSel).equals(selectionToString(selEndName))) { |
// throwException("tag names do not match"); |
//} |
} |
|
saveSelEnd(element.elementSel); |
if(listener != null) listener.processElementEnd(element); |
|
--level; |
|
return element; |
} |
|
private boolean parseStartTag(XmlElement element) |
throws XmlException, IOException |
{ |
// begin |
if(!testChar('<')) { |
return false; |
} |
|
// name |
buf.mark(); |
element.nameSel = new XmlSelection(); |
if(!parseName(element.nameSel)) { |
buf.reset(); |
|
return false; |
} |
|
// attributes |
XmlSelection selName = new XmlSelection(); |
XmlSelection selValue = new XmlSelection(); |
for(;;) { |
if(!skipSpaces()) break; |
if(!parseAttribute(selName, selValue)) break; |
} |
|
// end |
element.isEmpty = testChar('/'); |
if(!testChar('>')) { |
throwException("end of tag expected"); |
} |
|
return true; |
} |
|
private void parseEndTag(XmlSelection sel) |
throws XmlException, IOException |
{ |
// begin |
if(!testChar('<') || !testChar('/')) { |
throwException("cannot find tag end"); |
} |
|
// name |
if(!parseName(sel)) { |
throwException("tag name expected"); |
} |
|
// spaces |
skipSpaces(); |
|
// end |
if(!testChar('>')) { |
throwException("end of tag expected"); |
} |
} |
|
private boolean parseTagContent(XmlElement element) |
throws XmlException, IOException |
{ |
XmlSelection sel = new XmlSelection(); |
for(;;) { |
if(buf.isEnd()) { |
throwException("unexpected EoF"); |
} |
if(buf.at(0) == '<' && buf.at(1) == '/') break; |
|
if(parseElement() != null) { |
} |
else if(parseComment()) { |
} |
else if(parseCData(sel)) { |
} |
else if(parseProcessInstruction()) { |
} |
else if(parseCharData(sel)) { |
} |
else { |
throwException("unexpected tag content"); |
} |
|
// FIXME allow 'Reference' here |
} |
|
return true; |
} |
|
private boolean parseCharData(XmlSelection sel) |
throws XmlException, IOException |
{ |
// FIXME allow 'Reference' here |
|
boolean found = false; |
saveSelBegin(sel); |
|
for(;; buf.toNext()) { |
byte c = buf.cur(); |
if(buf.isEnd()) { |
throwException("unexpected EoF"); |
} |
|
if(c == '<' || c == '&') { |
break; |
} |
|
found = true; |
} |
|
saveSelEnd(sel); |
if(listener != null) listener.processCharData(sel); |
|
return found; |
} |
|
private boolean parseCData(XmlSelection sel) |
throws XmlException, IOException |
{ |
if(buf.at(0) != '<' || buf.at(1) != '!' |
|| buf.at(2) != '[' || buf.at(3) != 'C' |
|| buf.at(4) != 'D' || buf.at(5) != 'A' |
|| buf.at(6) != 'T' || buf.at(7) != 'A' |
|| buf.at(8) != '[') |
{ |
return false; |
} |
|
buf.skip(9); |
saveSelBegin(sel); |
|
for(;; buf.toNext()) { |
if(buf.isEnd()) { |
throwException("unexpected EoF"); |
} |
|
if(buf.at(0) == ']' && buf.at(1) == ']' && buf.at(2) == '>') { |
saveSelEnd(sel); |
buf.skip(3); |
if(listener != null) listener.processCData(sel); |
|
return true; |
} |
} |
} |
} |