Stanford parser学习：LexicalizedParser类分析

上次（http://www.cnblogs.com/stGeekpower/p/3457746.html）主要是对照javadoc介绍了LexicalizedParser类main函数的功能，这次看下main函数的具体处理过程。main函数大约350行，主要完成的工作是：初始化变量（各种标志位）、解析传入的各种参数、根据传入的选项参数分步骤完成各种工作。

根据选项来做的工作按顺序主要包括：分词器设置（必须最先处理）、初始化LexicalizedParser（读入或训练）、编码设置、测试数据设置、保存解析器（如果需要的话）、执行解析并输出结果。

具体解析的部分：对句子解析是通过LexicalizedParser对象生成的ParserQuery类的parse函数来完成，对文件的解析由ParseFiles类的parseFiles函数（最终也是调用ParserQuery类）完成。

一、初始化变量

这部分主要声明一些标志位，以及构建解析器所需的变量：

 boolean train = false;//train or parse

 boolean saveToSerializedFile = false;//是否序列化存储至文件

 boolean saveToTextFile = false;//是否存储至文本文件

 String serializedInputFileOrUrl = null;//序列化输入文件或者url

 String textInputFileOrUrl = null;//文本输入文件或者url

 String serializedOutputFileOrUrl = null;//序列化输出文件或者url

 String textOutputFileOrUrl = null;//文本输出文件或者url

 String treebankPath = null;//训练树库（treebank）路径

 Treebank testTreebank = null;

 Treebank tuneTreebank = null;

 String testPath = null;

 FileFilter testFilter = null;

 String tunePath = null;

 FileFilter tuneFilter = null;

 FileFilter trainFilter = null;//训练过滤范围

 String secondaryTreebankPath = null;

double secondaryTreebankWeight = 1.0;

 FileFilter secondaryTrainFilter = null;

 // variables needed to process the files to be parsed

 TokenizerFactory<? extends HasWord> tokenizerFactory = null; //分词工厂

 String tokenizerOptions = null;//分词所需参数

 String tokenizerFactoryClass = null;//分词所用类

 String tokenizerMethod = null;//分词所用方法

 boolean tokenized = false; // whether or not the input file has already been tokenized

 Function<List<HasWord>, List<HasWord>> escaper = null; //转义

 String tagDelimiter = null; //分隔符

 String sentenceDelimiter = null;

 String elementDelimiter = null;

二、解析传入的各种参数

这里处理用户传入的各种选项参数，并将其存入第一部分声明的变量中：

 int argIndex = 0;

 if (args.length < 1) {//参数数量为0，错误返回

      System.err.println("Basic usage (see Javadoc for more): java edu.stanford.nlp.parser.lexparser" +

                    ".LexicalizedParser parserFileOrUrl filename*");

     return;

 }

 Options op = new Options(); //处理参数的对象

 List<String> optionArgs = new ArrayList<String>();

 String encoding = null;

 // while loop through option arguments,循环处理选项参数

 while (argIndex < args.length && args[argIndex].charAt(0) == '-') {

     if (args[argIndex].equalsIgnoreCase("-train") || args[argIndex].equalsIgnoreCase("-trainTreebank")) {//判断是否执行训练功能

          train = true;

         //处理训练时传入的参数信息，得到文件路径和过滤范围存至treebankDescription

         Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");

         argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;

         treebankPath = treebankDescription.first();

         trainFilter = treebankDescription.second();

         } else if (args[argIndex].equalsIgnoreCase("-train2")) {

             // TODO: we could use the fully expressive -train options if

             // we add some mechanism for returning leftover options from

             // ArgUtils.getTreebankDescription

             // train = true;     // cdm july 2005: should require -train for this

             int numSubArgs = ArgUtils.numSubArgs(args, argIndex);

             argIndex++;

             if (numSubArgs < 2) {

                 throw new RuntimeException("Error: -train2 <treebankPath> [<ranges>] <weight>.");

             }

             secondaryTreebankPath = args[argIndex++];

             secondaryTrainFilter = (numSubArgs == 3) ? new NumberRangesFileFilter(args[argIndex++], true) : null;

             secondaryTreebankWeight = Double.parseDouble(args[argIndex++]);

         } else if (args[argIndex].equalsIgnoreCase("-tLPP") && (argIndex + 1 < args.length)) {

              // 当使用除英文外的语言或者English Penn Treebank之外的Treebank时候需要指定TreebankLangParserParams，

                // 该选项必须出现在其他的与语言相关的选项之前。不同的语言有不同的参数

                try {

                    op.tlpParams = (TreebankLangParserParams) Class.forName(args[argIndex + 1]).newInstance();

              } catch (ClassNotFoundException e) {

                  System.err.println("Class not found: " + args[argIndex + 1]);

                  throw new RuntimeException(e);

              } catch (InstantiationException e) {

                  System.err.println("Couldn't instantiate: " + args[argIndex + 1] + ": " + e.toString());

                  throw new RuntimeException(e);

              } catch (IllegalAccessException e) {

                  System.err.println("Illegal access" + e);

                  throw new RuntimeException(e);

              }

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-encoding")) {//编码

                // sets encoding for TreebankLangParserParams

              // redone later to override any serialized parser one read in

              encoding = args[argIndex + 1];

              op.tlpParams.setInputEncoding(encoding);

              op.tlpParams.setOutputEncoding(encoding);

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-tokenized")) {//是否已经分词

                 tokenized = true;

               argIndex += 1;

          } else if (args[argIndex].equalsIgnoreCase("-escaper")) {

               try {

                   escaper = ReflectionLoading.loadByReflection(args[argIndex + 1]);

               } catch (Exception e) {

                   System.err.println("Couldn't instantiate escaper " + args[argIndex + 1] + ": " + e);

               }

               argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-tokenizerOptions")) {//指定TokenizerFactory类完成tokenization 所需要的参数信息

                tokenizerOptions = args[argIndex + 1];

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-tokenizerFactory")) {//指定一个TokenizerFactory类来完成分词

                tokenizerFactoryClass = args[argIndex + 1];

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-tokenizerMethod")) {//分词方法

                tokenizerMethod = args[argIndex + 1];

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-sentences")) {//指定一个词语来划分句子边界,即分句根据

                sentenceDelimiter = args[argIndex + 1];

              if (sentenceDelimiter.equalsIgnoreCase("newline")) {

                  sentenceDelimiter = "\n";

              }

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-parseInside")) {//解析的范围，可以是句，几句等等

                elementDelimiter = args[argIndex + 1];

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-tagSeparator")) {//指明标注符号

                tagDelimiter = args[argIndex + 1];

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-loadFromSerializedFile") ||

              args[argIndex].equalsIgnoreCase("-model")) {

              // load the parser from a binary serialized file

              // the next argument must be the path to the parser file

              serializedInputFileOrUrl = args[argIndex + 1];

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-loadFromTextFile")) {

              // load the parser from declarative text file

              // the next argument must be the path to the parser file

              textInputFileOrUrl = args[argIndex + 1];

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-saveToSerializedFile")) {

              saveToSerializedFile = true;

              if (ArgUtils.numSubArgs(args, argIndex) < 1) {

                  System.err.println("Missing path: -saveToSerialized filename");

              } else {

                  serializedOutputFileOrUrl = args[argIndex + 1];

              }

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-saveToTextFile")) {

              // save the parser to declarative text file

              saveToTextFile = true;

              textOutputFileOrUrl = args[argIndex + 1];

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-saveTrainTrees")) {

              // save the training trees to a binary file

              op.trainOptions.trainTreeFile = args[argIndex + 1];

              argIndex += 2;

          } else if (args[argIndex].equalsIgnoreCase("-treebank") ||

              args[argIndex].equalsIgnoreCase("-testTreebank") ||

              args[argIndex].equalsIgnoreCase("-test")) {//训练并测试，测试所需的参数

                Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-test");

              argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;

              testPath = treebankDescription.first();

              testFilter = treebankDescription.second();

          } else if (args[argIndex].equalsIgnoreCase("-tune")) {

              Pair<String, FileFilter> treebankDescription = ArgUtils.getTreebankDescription(args, argIndex, "-tune");

              argIndex = argIndex + ArgUtils.numSubArgs(args, argIndex) + 1;

              tunePath = treebankDescription.first();

              tuneFilter = treebankDescription.second();

          } else {

              int oldIndex = argIndex;

              argIndex = op.setOptionOrWarn(args, argIndex);

              for (int i = oldIndex; i < argIndex; i++) {

                  optionArgs.add(args[i]);

              }

          }

 } // end while loop through arguments

三、分词处理

句法分析的前提是句子已经被正确分词。这一步并不直接执行分词，而是根据传入的参数构建分词器工厂（tokenizerFactory），供后面解析文件时使用；当然我们也可以通过参数指定适合自己的分词器：

// set up tokenizerFactory with options if provided

        if (tokenizerFactoryClass != null || tokenizerOptions != null) {

            try {//分词工厂类、分词方法由参数指定，若不指定，默认PTBTokenizer

                if (tokenizerFactoryClass != null) {

                    Class<TokenizerFactory<? extends HasWord>> clazz = ErasureUtils.uncheckedCast(Class.forName

                            (tokenizerFactoryClass));

                    Method factoryMethod;

                    if (tokenizerOptions != null) {

                        factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod :

                                "newWordTokenizerFactory", String.class);

                        tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null, tokenizerOptions));

                    } else {

                        factoryMethod = clazz.getMethod(tokenizerMethod != null ? tokenizerMethod :

                                "newTokenizerFactory");

                        tokenizerFactory = ErasureUtils.uncheckedCast(factoryMethod.invoke(null));

                    }

                } else {

                    // have options but no tokenizer factory; default to PTB

                    tokenizerFactory = PTBTokenizer.PTBTokenizerFactory.newWordTokenizerFactory(tokenizerOptions);

                }

            } catch (IllegalAccessException e) {

                System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " +

                        "" + tokenizerOptions);

                throw new RuntimeException(e);

            } catch (NoSuchMethodException e) {

                System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " +

                        "" + tokenizerOptions);

                throw new RuntimeException(e);

            } catch (ClassNotFoundException e) {

                System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " +

                        "" + tokenizerOptions);

                throw new RuntimeException(e);

            } catch (InvocationTargetException e) {

                System.err.println("Couldn't instantiate TokenizerFactory " + tokenizerFactoryClass + " with options " +

                        "" + tokenizerOptions);

                throw new RuntimeException(e);

            }

四、初始化LexicalizedParser

初始化LexicalizedParser有三种方式，分别是：根据数据训练一个，从文本文件读入，从序列化文件读入；

if (tuneFilter != null || tunePath != null) {//处理tune treebank

            if (tunePath == null) {

                if (treebankPath == null) {

                    throw new RuntimeException("No tune treebank path specified...");

                } else {

                    System.err.println("No tune treebank path specified.  Using train path: \"" + treebankPath + '\"');

                    tunePath = treebankPath;

                }

            }

            tuneTreebank = op.tlpParams.testMemoryTreebank();

            tuneTreebank.loadPath(tunePath, tuneFilter);

        }

        if (!train && op.testOptions.verbose) {

            StringUtils.printErrInvocationString("LexicalizedParser", args);

        }

        edu.stanford.nlp.parser.lexparser.LexicalizedParser lp; // always initialized in next if-then-else block

        if (train) {

            StringUtils.printErrInvocationString("LexicalizedParser", args);

            // so we train a parser using the treebank

            GrammarCompactor compactor = null;

            if (op.trainOptions.compactGrammar() == 3) {

                compactor = new ExactGrammarCompactor(op, false, false);

            }

            Treebank trainTreebank = makeTreebank(treebankPath, op, trainFilter);

            Treebank secondaryTrainTreebank = null;

            if (secondaryTreebankPath != null) {

                secondaryTrainTreebank = makeSecondaryTreebank(secondaryTreebankPath, op, secondaryTrainFilter);

            }

            List<List<TaggedWord>> extraTaggedWords = null;

            if (op.trainOptions.taggedFiles != null) {

                extraTaggedWords = new ArrayList<List<TaggedWord>>();

                List<TaggedFileRecord> fileRecords = TaggedFileRecord.createRecords(new Properties(),

                        op.trainOptions.taggedFiles);

                for (TaggedFileRecord record : fileRecords) {

                    for (List<TaggedWord> sentence : record.reader()) {

                        extraTaggedWords.add(sentence);

                    }

                }

            }

            //执行训练方法时对lp的初始化，根据标注数据训练出lp

            lp = getParserFromTreebank(trainTreebank, secondaryTrainTreebank, secondaryTreebankWeight, compactor, op,

                    tuneTreebank, extraTaggedWords);

        } else if (textInputFileOrUrl != null) {

            // so we load the parser from a text grammar file,直接从文本文件中读入lp

            lp = getParserFromTextFile(textInputFileOrUrl, op);

        } else {

            // so we load a serialized parser,从序列化保存的文件中读入lp

            if (serializedInputFileOrUrl == null && argIndex < args.length) {

                // the next argument must be the path to the serialized parser

                serializedInputFileOrUrl = args[argIndex];

                argIndex++;

            }

            if (serializedInputFileOrUrl == null) {

                System.err.println("No grammar specified, exiting...");

                return;

            }

            String[] extraArgs = new String[optionArgs.size()];

            extraArgs = optionArgs.toArray(extraArgs);

            try {

                lp = loadModel(serializedInputFileOrUrl, op, extraArgs);

                op = lp.op;

            } catch (IllegalArgumentException e) {

                System.err.println("Error loading parser, exiting...");

                throw e;

            }

        }

五、控制编码

 // the following has to go after reading parser to make sure

 // op and tlpParams are the same for train and test

 // THIS IS BUTT UGLY BUT IT STOPS USER SPECIFIED ENCODING BEING

 // OVERWRITTEN BY ONE SPECIFIED IN SERIALIZED PARSER

 if (encoding != null) {

    op.tlpParams.setInputEncoding(encoding);

    op.tlpParams.setOutputEncoding(encoding);

 }

六、测试数据设置

       if (testFilter != null || testPath != null) {

            if (testPath == null) {

                if (treebankPath == null) {

                    throw new RuntimeException("No test treebank path specified...");

                } else {

                    System.err.println("No test treebank path specified.  Using train path: \"" + treebankPath + '\"');

                    testPath = treebankPath;

                }

            }

            testTreebank = op.tlpParams.testMemoryTreebank();

            testTreebank.loadPath(testPath, testFilter);

        }

七、需要的话将解析器保存（文本格式或序列化格式）

        op.trainOptions.sisterSplitters = Generics.newHashSet(Arrays.asList(op.tlpParams.sisterSplitters()));

        // at this point we should be sure that op.tlpParams is

        // set appropriately (from command line, or from grammar file),

        // and will never change again.  -- Roger

        // Now what do we do with the parser we've made

        if (saveToTextFile) {

            // save the parser to textGrammar format

            if (textOutputFileOrUrl != null) {

                lp.saveParserToTextFile(textOutputFileOrUrl);

            } else {

                System.err.println("Usage: must specify a text grammar output path");

            }

        }

        if (saveToSerializedFile) {

            if (serializedOutputFileOrUrl != null) {

                lp.saveParserToSerialized(serializedOutputFileOrUrl);

            } else if (textOutputFileOrUrl == null && testTreebank == null) {

                // no saving/parsing request has been specified

                System.err.println("usage: " + "java edu.stanford.nlp.parser.lexparser.LexicalizedParser " + "-train " +

                        "trainFilesPath [fileRange] -saveToSerializedFile serializedParserFilename");

            }

        }

八、训练或者指定了verbose选项时，输出解析器的一些统计信息

        if (op.testOptions.verbose || train) {

            // Tell the user a little or a lot about what we have made

            // get lexicon size separately as it may have its own prints in it....

            String lexNumRules = lp.lex != null ? Integer.toString(lp.lex.numRules()) : "";

            System.err.println("Grammar\tStates\tTags\tWords\tUnaryR\tBinaryR\tTaggings");

            System.err.println("Grammar\t" +

                    lp.stateIndex.size() + '\t' +

                    lp.tagIndex.size() + '\t' +

                    lp.wordIndex.size() + '\t' +

                    (lp.ug != null ? lp.ug.numRules() : "") + '\t' +

                    (lp.bg != null ? lp.bg.numRules() : "") + '\t' +

                    lexNumRules);

            System.err.println("ParserPack is " + op.tlpParams.getClass().getName());

            System.err.println("Lexicon is " + lp.lex.getClass().getName());

            if (op.testOptions.verbose) {

                System.err.println("Tags are: " + lp.tagIndex);

                // System.err.println("States are: " + lp.pd.stateIndex); // This is too verbose. It was already

                // printed out by the below printOptions command if the flag -printStates is given (at training time)!

            }

            printOptions(false, op);

        }

九、执行解析工作

这里分三种情况：若指定了测试树库，则在测试树库上评测解析器；若没有剩余的命令行参数，则解析默认的测试句子；否则用ParseFiles类的parseFiles方法解析剩余参数给出的各个文件。

        if (testTreebank != null) {

            // test parser on treebank

            EvaluateTreebank evaluator = new EvaluateTreebank(lp);

            evaluator.testOnTreebank(testTreebank);

        } else if (argIndex >= args.length) {

            // no more arguments, so we just parse our own test sentence

            PrintWriter pwOut = op.tlpParams.pw();

            PrintWriter pwErr = op.tlpParams.pw(System.err);

            ParserQuery pq = lp.parserQuery();

            if (pq.parse(op.tlpParams.defaultTestSentence())) {//解析

                lp.getTreePrint().printTree(pq.getBestParse(), pwOut);

            } else {

                pwErr.println("Error. Can't parse test sentence: " +

                        op.tlpParams.defaultTestSentence());

            }

        } else {

            // We parse filenames given by the remaining arguments，解析

            ParseFiles.parseFiles(args, argIndex, tokenized, tokenizerFactory, elementDelimiter, sentenceDelimiter,

                    escaper, tagDelimiter, op, lp.getTreePrint(), lp);

        }