0001 package de.java2html.javasource;
0002
0003 import java.io.BufferedReader;
0004 import java.io.File;
0005 import java.io.FileReader;
0006 import java.io.IOException;
0007 import java.io.InputStream;
0008 import java.io.InputStreamReader;
0009 import java.io.Reader;
0010 import java.io.StringReader;
0011 import java.net.URL;
0012 import java.util.StringTokenizer;
0013
0014 import de.java2html.options.JavaSourceConversionOptions;
0015 import de.java2html.util.Ensure;
0016 import de.java2html.util.IoUtilities;
0017
0018 /**
0019 * Parses raw text to a {@link de.java2html.javasource.TypedSource} object. The
0020 * parser can not only handle grammatically correct Java source files but also
0021 * code snippets.
0022 *
0023 * <p>
 * (Parsing is done in multiple steps, starting with raw text in which every
 * character is classified as UNDEFINED and then refining that classification
 * step by step. Several state machines are used for parsing; they are hand
 * coded and quite complicated. The parser seems to be very stable: no bug
 * has been reported to me for about two years.)
0029 *
0030 * <p>
0031 * For questions, suggestions, bug-reports, enhancement-requests etc. I may be
0032 * contacted at: <a href="mailto:markus@jave.de">markus@jave.de</a>
0033 *
0034 * The Java2html home page is located at: <a href="http://www.java2html.de">
0035 * http://www.java2html.de</a>
0036 *
0037 * @author <a href="mailto:markus@jave.de">Markus Gebhard</a>
0038 *
0039 * <code>Copyright (C) Markus Gebhard 2000-2003
0040 *
0041 * This program is free software; you can redistribute it and/or
0042 * * modify it under the terms of the GNU General Public License
0043 * * as published by the Free Software Foundation; either version 2
0044 * * of the License, or (at your option) any later version.
0045 *
0046 * This program is distributed in the hope that it will be useful,
0047 * * but WITHOUT ANY WARRANTY; without even the implied warranty of
0048 * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0049 * * GNU General Public License for more details.
0050 *
0051 * You should have received a copy of the GNU General Public License
0052 * * along with this program; if not, write to the Free Software
0053 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.</code>
0054 */
0055 public class JavaSourceParser {
0056
0057 private final JavaSourceConversionOptions options;
0058
0059 /** Delimiters for numeric values. */
0060 private final static String NUM_DELIMITERS = " \t\n\r()[]{};:+-/\\*!?#%&|<>=^,";
0061
0062 /** Delimiters for finding data types and keywords. */
0063 private final static String DELIMITERS = " \t\n\r()[]{};:.+-/\\*!?#%&|<>=^";
0064
0065 /** Characters automatically classified as being empty (type==BACKGROUND) */
0066 private final static String EMPTY_STR = " \t\n\r\f";
0067
0068 private final static String[] PRIMITIVE_DATATYPES = {
0069 "boolean",
0070 "byte",
0071 "char",
0072 "double",
0073 "float",
0074 "int",
0075 "long",
0076 "short",
0077 "void" };
0078
0079 /** Counter for this and that (parseThree()?) */
0080 private int counter;
0081
0082 /** EOT=End of text */
0083 private final static char EOT = (char) -1;
0084
0085 /* State informations for state machine one */
0086 private ParseState parseState;
0087 private int parseSourcePos;
0088 private int parseTypePos;
0089
/**
 * Creates a parser that uses the options returned by
 * {@link JavaSourceConversionOptions#getDefault()}.
 */
public JavaSourceParser() {
    this(JavaSourceConversionOptions.getDefault());
}
0093
/**
 * Creates a parser that uses the given conversion options.
 *
 * @param options the options to use; checked with
 *          {@code Ensure.ensureArgumentNotNull} before being stored
 */
public JavaSourceParser(JavaSourceConversionOptions options) {
    Ensure.ensureArgumentNotNull(options);
    this.options = options;
}
0098
0099 private final static boolean isEmpty(char ch) {
0100 return (EMPTY_STR.indexOf(ch) != -1);
0101 }
0102
0103 private boolean isNumberDelimiter(char ch) {
0104 return (NUM_DELIMITERS.indexOf(ch) != -1);
0105 }
0106
0107 private final static int indexOf(char ch, String s, int start, int end) {
0108 if (end < start) {
0109 return -1;
0110 }
0111
0112 for (int i = start; i <= end; ++i) {
0113 if (s.charAt(i) == ch) {
0114 return i;
0115 }
0116 }
0117
0118 return -1;
0119 }
0120
0121 public TypedSource parse(File file) throws IOException {
0122 final TypedSource source = parse(new FileReader(file));
0123 source.setFileName(file.getName());
0124 return source;
0125 }
0126
0127 public TypedSource parse(String rawText) {
0128 if (rawText == null) {
0129 throw new NullPointerException();
0130 }
0131 try {
0132 return parse(new StringReader(rawText));
0133 }
0134 catch (final IOException e) {
0135 throw new RuntimeException("Unexpected exception while parsing plain text from a string ", e); //$NON-NLS-1$
0136 }
0137 }
0138
0139 public TypedSource parse(URL url) throws IOException {
0140 InputStream inputStream = null;
0141 try {
0142 inputStream = url.openStream();
0143 return parse(inputStream);
0144 }
0145 finally {
0146 IoUtilities.close(inputStream);
0147 }
0148 }
0149
0150 public TypedSource parse(InputStream stream) throws IOException {
0151 return parse(new InputStreamReader(stream));
0152 }
0153
0154 public TypedSource parse(Reader reader) throws IOException {
0155 if (reader == null) {
0156 throw new IllegalArgumentException("reader may not be null"); //$NON-NLS-1$
0157 }
0158 final String rawSourceCode;
0159 try {
0160 rawSourceCode = readPlainSource(reader);
0161 }
0162 finally {
0163 IoUtilities.close(reader);
0164 }
0165 final String sourceCode = replaceTabs(rawSourceCode);
0166
0167 final SourceParseObject parseObject = new SourceParseObject(sourceCode);
0168
0169 parseOne(parseObject);
0170 parseTwo(parseObject);
0171 parseThree(parseObject);
0172 parseFour(parseObject);
0173 doStatistics(parseObject);
0174
0175 return parseObject.createTypedSource();
0176 }
0177
0178 private void parseFour(SourceParseObject parseObject) {
0179 boolean isInsideAnnotation = false;
0180 for (int i = 0; i < parseObject.getCharacterCount(); ++i) {
0181 if (!isInsideAnnotation
0182 && parseObject.getSourceType(i) == SourceType.CODE
0183 && parseObject.getSourceCharAt(i) == '@') {
0184 isInsideAnnotation = true;
0185 parseObject.setType(i, SourceType.ANNOTATION);
0186 }
0187 else if (isInsideAnnotation
0188 && parseObject.getSourceType(i) == SourceType.CODE
0189 && (Character.isJavaIdentifierPart(parseObject.getSourceCharAt(i)) || parseObject.getSourceCharAt(i) == '.')) {
0190 parseObject.setType(i, SourceType.ANNOTATION);
0191 }
0192 else {
0193 isInsideAnnotation = false;
0194 }
0195 }
0196 }
0197
0198 /**
0199 * Gathers statistical information from the source code. After parsing this
0200 * is quite easy and maybe it is useful for others. lineCount is needed for
0201 * the html converter.
0202 */
0203 private void doStatistics(SourceParseObject parseObject) {
0204 int index = 0;
0205 final String sourceCode = parseObject.getSourceCode();
0206 final JavaSourceStatistic statistics = parseObject.getStatistics();
0207 statistics.setCharacterCount(sourceCode.length());
0208 int linesContainingAnything = 0;
0209
0210 if (sourceCode.length() == 0) {
0211 statistics.setLineCount(0);
0212 }
0213 else {
0214 final StringTokenizer st = new StringTokenizer(sourceCode, "\n\r", true);
0215 while (st.hasMoreTokens()) {
0216 final String line = st.nextToken();
0217
0218 if (line.charAt(0) == '\r') {
0219 ++index;
0220 }
0221 else if (line.charAt(0) == '\n') {
0222 ++index;
0223 statistics.setLineCount(statistics.getLineCount() + 1);
0224 }
0225 else {
0226 ++linesContainingAnything;
0227 statistics(parseObject.getSourceTypes(), statistics, line, index);
0228 index += line.length();
0229 }
0230 }
0231 statistics.setLineCount(statistics.getLineCount() + 1);
0232 }
0233
0234 //some empty lines without any were not counted
0235 statistics.setEmptyLineCount(statistics.getLineCount() - linesContainingAnything);
0236 }
0237
0238 private void statistics(SourceType[] sourceTypes, JavaSourceStatistic statistics, String line, int start) {
0239 if (line.length() > statistics.getMaxLineLength()) {
0240 statistics.setMaxLineLength(line.length());
0241 }
0242
0243 final int end = start + line.length();
0244
0245 boolean containsCode = false;
0246 boolean containsComment = false;
0247
0248 for (int i = start; i < end; ++i) {
0249 if (sourceTypes[i] == SourceType.CODE
0250 || sourceTypes[i] == SourceType.KEYWORD
0251 || sourceTypes[i] == SourceType.CODE_TYPE
0252 || sourceTypes[i] == SourceType.CHAR_CONSTANT
0253 || sourceTypes[i] == SourceType.NUM_CONSTANT) {
0254 containsCode = true;
0255 if (containsComment) {
0256 break;
0257 }
0258 }
0259 else if (sourceTypes[i] == SourceType.COMMENT_BLOCK
0260 || sourceTypes[i] == SourceType.COMMENT_LINE
0261 || sourceTypes[i] == SourceType.JAVADOC
0262 || sourceTypes[i] == SourceType.JAVADOC_KEYWORD) {
0263 containsComment = true;
0264 if (containsCode) {
0265 break;
0266 }
0267 }
0268 }
0269
0270 if (containsCode) {
0271 statistics.setCodeLineCount(statistics.getCodeLineCount() + 1);
0272 }
0273 if (containsComment) {
0274 statistics.setCommentLineCount(statistics.getCommentLineCount() + 1);
0275 }
0276 if (!containsCode && !containsComment) {
0277 statistics.setEmptyLineCount(statistics.getEmptyLineCount() + 1);
0278 }
0279 }
0280
0281 private String readPlainSource(Reader reader) throws IOException {
0282 return readPlainSource(new BufferedReader(reader));
0283 }
0284
0285 private String readPlainSource(BufferedReader reader) throws IOException {
0286
0287 final StringBuffer sb = new StringBuffer();
0288 String line;
0289 while ((line = reader.readLine()) != null) {
0290 sb.append(line);
0291 sb.append("\r\n");
0292 }
0293 if (sb.length() > 0) {
0294 sb.setLength(sb.length() - 2);
0295 }
0296 return sb.toString();
0297 // while (true){
0298 // char[] buffer = new char[256];
0299 // int length = reader.read(buffer, 0 , 256);
0300 // if (length<=0){
0301 // break;
0302 // }
0303 // sb.append(buffer, 0, length);
0304 // }
0305 // //Newlines are converted to "\r\n" for compatibility with eclipse
0306 // styledtext!!!
0307 // return NewLineNormalizer.normalize(sb.toString(), "\r\n");
0308 }
0309
0310 /**
0311 * Preprocessing: Replaces all tabs (\t) by 'tabs' space characters.
0312 */
0313 private String replaceTabs(String sourceCode) {
0314 final char[] t = new char[options.getTabSize()];
0315 for (int i = 0; i < options.getTabSize(); ++i) {
0316 t[i] = ' ';
0317 }
0318
0319 final StringBuffer sb = new StringBuffer((int) (sourceCode.length() * 1.3));
0320 for (int i = 0; i < sourceCode.length(); ++i) {
0321 final char ch = sourceCode.charAt(i);
0322 if (ch == '\t') {
0323 sb.append(t);
0324 }
0325 else {
0326 sb.append(ch);
0327 }
0328 }
0329
0330 return sb.toString();
0331 }
0332
0333 /**
0334 * First step of parsing. All characters are classified 'UNDEFINED' and we
0335 * try to divide this into: CODE, CHAR_CONSTANT, COMMENT_LINE, COMMENT_BLOCK,
0336 * COMMENT_JAVADOC, BACKGROUND and QUOTE This is done by a quite complicate
0337 * state machine.
0338 */
0339 private void parseOne(SourceParseObject parseObject) {
0340 parseState = ParseState.COD;
0341 parseSourcePos = 0;
0342 parseTypePos = 0;
0343
0344 while (parseState != ParseState.FINISHED) {
0345 parseOneDo(parseObject);
0346 }
0347 }
0348
0349 /**
0350 * State-machine for classifying the code to: CODE, CHAR_CONSTANT,
0351 * COMMENT_LINE, COMMENT_BLOCK, COMMENT_JAVADOC, BACKGROUND and QUOTE
0352 *
0353 * Note: It works - don't ask me how! If you want to know more about it all
0354 * you can do is taking a sheet of paper (or more) and a pencil and try to
0355 * draw the state machine :-)
0356 */
0357 private void parseOneDo(SourceParseObject parseObject) {
0358 final char ch;
0359 if (parseObject.getCharacterCount() > parseSourcePos) {
0360 ch = parseObject.getSourceCharAt(parseSourcePos++);
0361 }
0362 else {
0363 ch = EOT;
0364 }
0365
0366 switch (parseState) {
0367 case COD:
0368 if (ch == EOT) {
0369 parseState = ParseState.FINISHED;
0370 return;
0371 }
0372 if (ch == '/') {
0373 parseState = ParseState.CODE_AWAIT_COMMENT;
0374 return;
0375 }
0376 if (ch == '"') {
0377 parseObject.setType(parseTypePos++, SourceType.STRING);
0378 parseState = ParseState.QU;
0379 return;
0380 }
0381 if (ch == '\'') {
0382 parseState = ParseState.CH1;
0383 return;
0384 }
0385 if (isEmpty(ch)) {
0386 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0387 return;
0388 }
0389 parseObject.setType(parseTypePos++, SourceType.CODE);
0390 return;
0391 case CODE_AWAIT_COMMENT:
0392 if (ch == EOT) {
0393 parseState = ParseState.FINISHED;
0394 parseObject.setType(parseTypePos++, SourceType.CODE);
0395 return;
0396 }
0397 if (ch == '/') {
0398 parseObject.setType(parseTypePos++, SourceType.COMMENT_LINE);
0399 parseObject.setType(parseTypePos++, SourceType.COMMENT_LINE);
0400 parseState = ParseState.CL;
0401 return;
0402 }
0403 if (ch == '*') {
0404 parseState = ParseState.CBJ1;
0405 return;
0406 }
0407 if (isEmpty(ch)) {
0408 parseObject.setType(parseTypePos++, SourceType.CODE);
0409 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0410 parseState = ParseState.COD;
0411 return;
0412 }
0413
0414 parseObject.setType(parseTypePos++, SourceType.CODE);
0415 parseObject.setType(parseTypePos++, SourceType.CODE);
0416 parseState = ParseState.COD;
0417 return;
0418 case CL:
0419 if (ch == EOT) {
0420 parseState = ParseState.FINISHED;
0421 return;
0422 }
0423 if (ch == '\n' || ch == '\r') {
0424 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0425 //ggf. durch COMMENT_LINE ersetzen
0426 parseState = ParseState.COD;
0427 return;
0428 }
0429 if (isEmpty(ch)) {
0430 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0431 return;
0432 }
0433 parseObject.setType(parseTypePos++, SourceType.COMMENT_LINE);
0434 return;
0435 case CB:
0436 if (ch == EOT) {
0437 parseState = ParseState.FINISHED;
0438 return;
0439 }
0440 if (ch == '*') {
0441 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0442 parseState = ParseState.CBA;
0443 return;
0444 }
0445 if (isEmpty(ch)) {
0446 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0447 return;
0448 }
0449 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0450 return;
0451 case CBA:
0452 if (ch == EOT) {
0453 parseState = ParseState.FINISHED;
0454 return;
0455 }
0456 if (ch == '/') {
0457 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0458 parseState = ParseState.COD;
0459 return;
0460 }
0461 if (ch == '*') {
0462 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0463 parseState = ParseState.CBA;
0464 return;
0465 }
0466 if (isEmpty(ch)) {
0467 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0468 parseState = ParseState.CB;
0469 return;
0470 }
0471 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0472 parseState = ParseState.CB;
0473 return;
0474 case CJ:
0475 if (ch == EOT) {
0476 parseState = ParseState.FINISHED;
0477 return;
0478 }
0479 if (ch == '*') {
0480 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0481 parseState = ParseState.CJA;
0482 return;
0483 }
0484 if (isEmpty(ch)) {
0485 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0486 return;
0487 }
0488 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0489 return;
0490 case CJA:
0491 if (ch == EOT) {
0492 parseState = ParseState.FINISHED;
0493 return;
0494 }
0495 if (ch == '/') {
0496 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0497 parseState = ParseState.COD;
0498 return;
0499 }
0500 if (ch == '*') {
0501 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0502 parseState = ParseState.CJA;
0503 return;
0504 }
0505 if (isEmpty(ch)) {
0506 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0507 parseState = ParseState.CJ;
0508 return;
0509 }
0510 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0511 parseState = ParseState.CJ;
0512 return;
0513 case QU:
0514 if (ch == EOT) {
0515 parseState = ParseState.FINISHED;
0516 return;
0517 }
0518 if (ch == '"') {
0519 parseObject.setType(parseTypePos++, SourceType.STRING);
0520 parseState = ParseState.COD;
0521 return;
0522 }
0523 if (ch == '\\') {
0524 parseState = ParseState.QUA;
0525 return;
0526 }
0527 if (isEmpty(ch)) {
0528 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0529 return;
0530 }
0531
0532 parseObject.setType(parseTypePos++, SourceType.STRING);
0533 return;
0534 case QUA:
0535 if (ch == EOT) {
0536 parseObject.setType(parseTypePos++, SourceType.STRING);
0537 parseState = ParseState.FINISHED;
0538 return;
0539 }
0540 if (ch == '\\') {
0541 parseObject.setType(parseTypePos++, SourceType.STRING);
0542 parseObject.setType(parseTypePos++, SourceType.STRING);
0543 parseState = ParseState.QU; //This one has been changed from QUA to QU in 2.0
0544 return;
0545 }
0546 if (isEmpty(ch)) {
0547 parseObject.setType(parseTypePos++, SourceType.STRING);
0548 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0549 parseState = ParseState.QU;
0550 return;
0551 }
0552 parseObject.setType(parseTypePos++, SourceType.STRING);
0553 parseObject.setType(parseTypePos++, SourceType.STRING);
0554 parseState = ParseState.QU;
0555 return;
0556 case CBJ1:
0557 if (ch == EOT) {
0558 parseState = ParseState.FINISHED;
0559 parseObject.setType(parseTypePos++, SourceType.UNDEFINED);
0560 parseObject.setType(parseTypePos++, SourceType.UNDEFINED);
0561 return;
0562 }
0563 if (ch == '*') {
0564 parseState = ParseState.CBJ2;
0565 return;
0566 }
0567 if (isEmpty(ch)) {
0568 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0569 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0570 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0571 parseState = ParseState.CB;
0572 return;
0573 }
0574 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0575 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0576 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0577 parseState = ParseState.CB;
0578 return;
0579 case CBJ2:
0580 if (ch == EOT) {
0581 parseState = ParseState.FINISHED;
0582 parseObject.setType(parseTypePos++, SourceType.UNDEFINED);
0583 parseObject.setType(parseTypePos++, SourceType.UNDEFINED);
0584 parseObject.setType(parseTypePos++, SourceType.UNDEFINED);
0585 return;
0586 }
0587 if (ch == '/') {
0588 parseState = ParseState.COD;
0589 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0590 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0591 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0592 parseObject.setType(parseTypePos++, SourceType.COMMENT_BLOCK);
0593 return;
0594 }
0595 if (isEmpty(ch)) {
0596 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0597 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0598 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0599 parseObject.setType(parseTypePos++, SourceType.BACKGROUND);
0600 parseState = ParseState.CJ;
0601 return;
0602 }
0603 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0604 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0605 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0606 parseObject.setType(parseTypePos++, SourceType.JAVADOC);
0607 parseState = ParseState.CJ;
0608 return;
0609 case CH1:
0610 if (ch == EOT) {
0611 parseObject.setType(parseTypePos++, SourceType.CODE);
0612 parseState = ParseState.FINISHED;
0613 return;
0614 }
0615 if (ch == '\\') {
0616 parseState = ParseState.CH3;
0617 return;
0618 }
0619 parseState = ParseState.CH2;
0620 return;
0621 case CH2:
0622 if (ch == EOT) {
0623 parseObject.setType(parseTypePos++, SourceType.CODE);
0624 parseObject.setType(parseTypePos++, SourceType.CODE);
0625 parseState = ParseState.FINISHED;
0626 return;
0627 }
0628 if (ch == '\'') {
0629 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0630 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0631 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0632 parseState = ParseState.COD;
0633 return;
0634 }
0635 parseObject.setType(parseTypePos++, SourceType.UNDEFINED);
0636 parseObject.setType(parseTypePos++, SourceType.UNDEFINED);
0637 parseObject.setType(parseTypePos++, SourceType.UNDEFINED);
0638 parseState = ParseState.COD;
0639 return;
0640 case CH3:
0641 if (ch == EOT) {
0642 parseObject.setType(parseTypePos++, SourceType.CODE);
0643 parseObject.setType(parseTypePos++, SourceType.CODE);
0644 parseState = ParseState.FINISHED;
0645 return;
0646 }
0647 if (ch == 'u') {
0648 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0649 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0650 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0651 parseState = ParseState.CH5;
0652 return;
0653 }
0654 if (ch >= '1' && ch <= '9') {
0655 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0656 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0657 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0658 parseState = ParseState.CH6;
0659 return;
0660 }
0661 parseState = ParseState.CH4;
0662 return;
0663 case CH4:
0664 if (ch == EOT) {
0665 parseObject.setType(parseTypePos++, SourceType.CODE);
0666 parseObject.setType(parseTypePos++, SourceType.CODE);
0667 parseObject.setType(parseTypePos++, SourceType.CODE);
0668 parseState = ParseState.FINISHED;
0669 return;
0670 }
0671 if (ch == '\'') {
0672 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0673 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0674 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0675 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0676 parseState = ParseState.COD;
0677 return;
0678 }
0679 parseObject.setType(parseTypePos++, SourceType.CODE);
0680 parseObject.setType(parseTypePos++, SourceType.CODE);
0681 parseObject.setType(parseTypePos++, SourceType.CODE);
0682 parseObject.setType(parseTypePos++, SourceType.CODE);
0683 parseState = ParseState.COD;
0684 return;
0685 case CH6:
0686 if (ch == EOT) {
0687 parseState = ParseState.FINISHED;
0688 return;
0689 }
0690 if (ch == '\'') {
0691 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0692 parseState = ParseState.COD;
0693 return;
0694 }
0695 if (ch >= '0' && ch <= '9') {
0696 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0697 return;
0698 }
0699 parseObject.setType(parseTypePos++, SourceType.UNDEFINED);
0700 parseState = ParseState.COD;
0701 return;
0702 case CH5:
0703 if (ch == EOT) {
0704 parseState = ParseState.FINISHED;
0705 return;
0706 }
0707 if (ch == '\'') {
0708 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0709 parseState = ParseState.COD;
0710 return;
0711 }
0712 if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F')) {
0713 parseObject.setType(parseTypePos++, SourceType.CHAR_CONSTANT);
0714 return;
0715 }
0716 parseObject.setType(parseTypePos++, SourceType.UNDEFINED);
0717 parseState = ParseState.COD;
0718 return;
0719 }
0720 }
0721
0722 /**
0723 * Second step for parsing. The categories from the first step are further
0724 * divided: COMMENT_JAVADOC to COMMENT_JAVADOC and COMMENT_KEYWORD CODE to
0725 * CODE, CODE_TYPE and CODE_KEYWORD
0726 */
0727 private void parseTwo(SourceParseObject parseObject) {
0728 for (int index = 0; index < parseObject.getCharacterCount(); ++index) {
0729 if (parseObject.getSourceType(index) == SourceType.CODE) {
0730 if (isParenthesis(parseObject.getSourceCharAt(index))) {
0731 parseObject.setType(index, SourceType.PARENTHESIS);
0732 }
0733 }
0734 }
0735
0736 int start = 0;
0737 int end = 0;
0738
0739 while (end < parseObject.getCharacterCount() - 1) {
0740 while (end < parseObject.getCharacterCount() - 1
0741 && parseObject.getSourceType(end + 1) == parseObject.getSourceType(start)) {
0742 ++end;
0743 }
0744
0745 parseTwo(parseObject, start, end);
0746
0747 start = end + 1;
0748 end = start;
0749 }
0750 }
0751
0752 private static boolean isParenthesis(char ch) {
0753 return ch == '{' || ch == '}' || ch == '[' || ch == ']' || ch == '(' || ch == ')';
0754 }
0755
0756 private void parseTwo(SourceParseObject parseObject, int start, int end) {
0757 if (parseObject.getSourceType(start) == SourceType.JAVADOC) {
0758 parseTwoCommentBlock(parseObject, start, end);
0759 return;
0760 }
0761 else if (parseObject.getSourceType(start) == SourceType.CODE) {
0762 parseTwoCode(parseObject, start, end);
0763 return;
0764 }
0765 //Keine weitere Unterteilung möglich
0766 return;
0767 }
0768
0769 /**
0770 * Looks for primitive datatyes and keywords in the given region.
0771 */
0772 private void parseTwoCode(SourceParseObject parseObject, int start, int end) {
0773 final String code = parseObject.getSourceCode().substring(start, end + 1);
0774
0775 int index = start;
0776 final StringTokenizer st = new StringTokenizer(code, DELIMITERS, true);
0777 while (st.hasMoreTokens()) {
0778 final String s = st.nextToken();
0779 //Keyword?
0780 if (JavaKeywords.getInstance().isJavaKeyWord(s)) {
0781 parseObject.setType(index, index + s.length(), SourceType.KEYWORD);
0782 if (s.equals("package")) {
0783 final int i1 = parseObject.getSourceCode().indexOf(';', index + 1);
0784 if (i1 != -1) {
0785 parseObject.getStatistics().setPackageName(
0786 parseObject.getSourceCode().substring(index + s.length(), i1).trim());
0787 }
0788 }
0789 }
0790 else {
0791 //Datatype?
0792 for (int i = 0; i < PRIMITIVE_DATATYPES.length; ++i) {
0793 if (s.equals(PRIMITIVE_DATATYPES[i])) {
0794 parseObject.setType(index, index + s.length(), SourceType.CODE_TYPE);
0795 break;
0796 }
0797 }
0798 }
0799 index += s.length();
0800 }
0801 }
0802
0803 /**
0804 * Tries to find JavaDoc comment keywords and html tags @l
0805 */
0806 private void parseTwoCommentBlock(SourceParseObject parseObject, int start, int end) {
0807 int i1 = indexOf('@', parseObject.getSourceCode(), start, end);
0808
0809 while (i1 != -1 && i1 + 1 < end) {
0810 int i2 = i1 + 1;
0811
0812 char ch = parseObject.getSourceCharAt(i2 + 1);
0813 while (i2 < end && ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'))) {
0814 ch = parseObject.getSourceCharAt(++i2 + 1);
0815 }
0816
0817 final String s = parseObject.getSourceCode().substring(i1, i2 + 1);
0818 //s is likely to be a valid JavaDoc-Tag
0819
0820 // if ((s.equals("@link") || s.equals("@linkplain"))
0821 // && sourceCode.charAt(i1 - 1) == '{'
0822 // && start > 0) {
0823 // mark(i1 - 1, i1 + 5, JavaSourceType.JAVADOC_LINKS);
0824 // }
0825 // else
0826 if (JavaKeywords.getInstance().isJavaDocKeyword(s)) {
0827 parseObject.setType(i1, i2 + 1, SourceType.JAVADOC_KEYWORD);
0828 }
0829
0830 i1 = indexOf('@', parseObject.getSourceCode(), i2, end);
0831 }
0832
0833 //find html tags
0834 i1 = indexOf('<', parseObject.getSourceCode(), start, end);
0835 while (i1 != -1 && i1 + 1 < end) {
0836 final int i2 = parseObject.getSourceCode().indexOf('>', i1 + 1);
0837
0838 // char ch=sourceCode.charAt(i2+1);
0839 // while(i2<end && ch!='>'){
0840 // ch=sourceCode.charAt(++i2+1);
0841 // }
0842 if (i2 == -1) {
0843 i1 = -1;
0844 break;
0845 }
0846 if (parseObject.hasTypeOrEmpty(i1, i2 + 1, SourceType.JAVADOC)) {
0847 parseObject.setType(i1, i2 + 1, SourceType.JAVADOC_HTML_TAG);
0848 }
0849 i1 = indexOf('<', parseObject.getSourceCode(), i2, end);
0850 }
0851 }
0852
0853 /**
0854 * Third step for parsing: Finding number constants. CODE is further divided
0855 * to CODE and NUM_CONSTANT
0856 */
0857 private void parseThree(SourceParseObject parseObject) {
0858 int start = 0;
0859 int end = 0;
0860
0861 while (end < parseObject.getCharacterCount() - 1) {
0862 while (end < parseObject.getCharacterCount() - 1
0863 && parseObject.getSourceType(end + 1) == parseObject.getSourceType(start)) {
0864 ++end;
0865 }
0866
0867 if (parseObject.getSourceType(start) == SourceType.CODE) {
0868 parseThree(parseObject, start, end);
0869 }
0870
0871 start = end + 1;
0872 end = start;
0873 }
0874
0875 expandJavaDocLinks(parseObject);
0876 }
0877
0878 private void expandJavaDocLinks(SourceParseObject parseObject) {
0879 expandEmbracedJavaDocTag(parseObject, "@link", SourceType.JAVADOC_LINKS);
0880 expandEmbracedJavaDocTag(parseObject, "@linkplain", SourceType.JAVADOC_LINKS);
0881 }
0882
0883 private void expandEmbracedJavaDocTag(SourceParseObject parseObject, String tag, SourceType type) {
0884 final String pattern = "{" + tag;
0885
0886 final String sourceCode = parseObject.getSourceCode();
0887 for (int index = 0; index < parseObject.getCharacterCount(); ++index) {
0888 final int start = sourceCode.indexOf(pattern, index);
0889 if (start == -1) {
0890 break;
0891 }
0892
0893 final char ch = sourceCode.charAt(start + pattern.length());
0894 if (Character.isLetterOrDigit(ch)) {
0895 break;
0896 }
0897
0898 if (!parseObject.checkRegion(start + 1, start + 1 + tag.length() - 1, new ISourceTypeChecker() {
0899 public boolean isValid(SourceType type) {
0900 return type.equals(SourceType.JAVADOC_KEYWORD);
0901 }
0902 })) {
0903 break;
0904 }
0905
0906 final int end = sourceCode.indexOf('}', start + pattern.length());
0907 if (end == -1) {
0908 break;
0909 }
0910
0911 //Check region, can only be JavaDoc and Background
0912 if (parseObject.checkRegion(start + 1 + tag.length(), end, new ISourceTypeChecker() {
0913 public boolean isValid(SourceType type) {
0914 return type.equals(SourceType.BACKGROUND) || type.equals(SourceType.JAVADOC);
0915 }
0916 })) {
0917 markWithoutBackground(parseObject, start, end, type);
0918 }
0919 index = end;
0920 }
0921
0922 }
0923
0924 private void markWithoutBackground(SourceParseObject parseObject, int start, int end, SourceType type) {
0925 for (int i = start; i <= end; ++i) {
0926 if (!parseObject.getSourceType(i).equals(SourceType.BACKGROUND)) {
0927 parseObject.setType(i, type);
0928 }
0929 }
0930 }
0931
0932 /**
0933 * Looks for number constants (NUM_CONSTANT) in the selected region.
0934 */
0935 private void parseThree(SourceParseObject parseObject, int start, int end) {
0936 parseState = ParseState.START;
0937 parseSourcePos = start;
0938 parseTypePos = start - 1;
0939 counter = 0;
0940
0941 while (parseState != ParseState.FINISHED) {
0942 parseThreeDo(parseObject, end);
0943 }
0944 }
0945
0946 /**
0947 * State-machine for NUM_CONSTANTs
0948 */
0949 private void parseThreeDo(SourceParseObject parseObject, int end) {
0950 final char ch;
0951 if (parseSourcePos <= end) {
0952 ch = parseObject.getSourceCharAt(parseSourcePos);
0953 }
0954 else {
0955 ch = EOT;
0956 }
0957
0958 ++parseSourcePos;
0959 ++parseTypePos;
0960
0961 switch (parseState) {
0962 case START:
0963 if (ch == EOT) {
0964 parseState = ParseState.FINISHED;
0965 return;
0966 }
0967 if (ch == '.') {
0968 ++counter;
0969 parseState = ParseState.PARSESTATE_DA;
0970 return;
0971 }
0972 if (ch == '0') {
0973 ++counter;
0974 parseState = ParseState.ZERO_EXPECTING_HEX;
0975 return;
0976 }
0977 if (ch >= '1' && ch <= '9') {
0978 ++counter;
0979 parseState = ParseState.NUMERIC_CONSTANT;
0980 return;
0981 }
0982 if (isNumberDelimiter(ch)) {
0983 //stay in this parse state
0984 return;
0985 }
0986 parseState = ParseState.NEUTRAL;
0987 return;
0988 case NEUTRAL:
0989 if (ch == EOT) {
0990 parseState = ParseState.FINISHED;
0991 return;
0992 }
0993 if (isNumberDelimiter(ch)) {
0994 parseState = ParseState.START;
0995 return;
0996 }
0997 return;
0998 case PARSESTATE_DA:
0999 if (ch == EOT) {
1000 parseState = ParseState.FINISHED;
1001 return;
1002 }
1003 if (ch >= '0' && ch <= '9') {
1004 ++counter;
1005 parseState = ParseState.NUMERIC_CONSTANT;
1006 return;
1007 }
1008 if (isNumberDelimiter(ch)) {
1009 parseState = ParseState.START;
1010 counter = 0;
1011 return;
1012 }
1013 parseState = ParseState.NEUTRAL;
1014 counter = 0;
1015 return;
1016 case NUMERIC_CONSTANT:
1017 if (ch == EOT) {
1018 parseState = ParseState.FINISHED;
1019 parseObject.setType(parseTypePos - counter, parseTypePos, SourceType.NUM_CONSTANT);
1020 return;
1021 }
1022 if (ch == '.' || (ch >= '0' && ch <= '9')) {
1023 ++counter;
1024 return;
1025 }
1026 if (ch == 'e') {
1027 parseState = ParseState.EXPONENT;
1028 ++counter;
1029 return;
1030 }
1031 if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D' || ch == 'l' || ch == 'L') {
1032 ++counter;
1033 parseObject.setType(parseTypePos - counter + 1, parseTypePos + 1, SourceType.NUM_CONSTANT);
1034 parseState = ParseState.NEUTRAL;
1035 counter = 0;
1036 return;
1037 }
1038 if (isNumberDelimiter(ch)) {
1039 parseState = ParseState.START;
1040 parseObject.setType(parseTypePos - counter, parseTypePos, SourceType.NUM_CONSTANT);
1041 counter = 0;
1042 return;
1043 }
1044 parseObject.setType(parseTypePos - counter, SourceType.NUM_CONSTANT);
1045 parseState = ParseState.NEUTRAL;
1046 counter = 0;
1047 return;
1048 case ZERO_EXPECTING_HEX:
1049 if (ch == EOT) {
1050 parseState = ParseState.FINISHED;
1051 parseObject.setType(parseTypePos - counter, parseTypePos, SourceType.NUM_CONSTANT);
1052 return;
1053 }
1054 if (ch == 'x' || ch == 'X') {
1055 parseState = ParseState.HEXADECIMAL_CONSTANT;
1056 ++counter;
1057 return;
1058 }
1059 if (ch == '.' || (ch >= '0' && ch <= '9')) {
1060 ++counter;
1061 parseState = ParseState.NUMERIC_CONSTANT;
1062 return;
1063 }
1064 if (ch == 'f' || ch == 'F' || ch == 'd' || ch == 'D' || ch == 'l' || ch == 'L') {
1065 ++counter;
1066 parseObject.setType(parseTypePos - counter + 1, parseTypePos + 1, SourceType.NUM_CONSTANT);
1067 parseState = ParseState.NEUTRAL;
1068 counter = 0;
1069 return;
1070 }
1071 if (isNumberDelimiter(ch)) {
1072 parseState = ParseState.START;
1073 parseObject.setType(parseTypePos - counter, parseTypePos, SourceType.NUM_CONSTANT);
1074 counter = 0;
1075 return;
1076 }
1077 parseObject.setType(parseTypePos - counter, parseTypePos, SourceType.NUM_CONSTANT);
1078 parseState = ParseState.NEUTRAL;
1079 counter = 0;
1080 return;
1081 case HEXADECIMAL_CONSTANT:
1082 if (ch == EOT) {
1083 parseState = ParseState.FINISHED;
1084 parseObject.setType(parseTypePos - counter, parseTypePos, SourceType.NUM_CONSTANT);
1085 return;
1086 }
1087 if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F')) {
1088 ++counter;
1089 parseState = ParseState.HEXADECIMAL_CONSTANT;
1090 return;
1091 }
1092 if (ch == 'l' || ch == 'L') {
1093 ++counter;
1094 parseObject.setType(parseTypePos - counter + 1, parseTypePos + 1, SourceType.NUM_CONSTANT);
1095 parseState = ParseState.NEUTRAL;
1096 counter = 0;
1097 return;
1098 }
1099 if (isNumberDelimiter(ch)) {
1100 parseState = ParseState.START;
1101 parseObject.setType(parseTypePos - counter, parseTypePos, SourceType.NUM_CONSTANT);
1102 counter = 0;
1103 return;
1104 }
1105 parseObject.setType(parseTypePos - counter, parseTypePos, SourceType.NUM_CONSTANT);
1106 parseState = ParseState.NEUTRAL;
1107 counter = 0;
1108 return;
1109 case EXPONENT:
1110 if (ch == EOT) {
1111 parseState = ParseState.FINISHED;
1112 parseObject.setType(parseTypePos - counter, parseTypePos - 1, SourceType.NUM_CONSTANT);
1113 return;
1114 }
1115 if ((ch >= '0' && ch <= '9') || ch == '+' || ch == '-') {
1116 ++counter;
1117 parseState = ParseState.NUMERIC_CONSTANT;
1118 return;
1119 }
1120 if (isNumberDelimiter(ch)) {
1121 parseState = ParseState.START;
1122 parseObject.setType(parseTypePos - counter, parseTypePos - 1, SourceType.NUM_CONSTANT);
1123 counter = 0;
1124 return;
1125 }
1126 parseObject.setType(parseTypePos - counter, parseTypePos - 1, SourceType.NUM_CONSTANT);
1127 parseState = ParseState.NEUTRAL;
1128 counter = 0;
1129 return;
1130 }
1131 }
1132 }
|