NewickTreeParser.java

/*
 * Copyright 2013 University of Glasgow.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package broadwick.phylo;

import broadwick.graph.Edge;
import broadwick.graph.Tree;
import broadwick.io.FileInput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;

/**
 * Parser for a Newick tree. This has limited functionality and only works for Newick trees <ul> <li> with unique node
 * names (e.g. one node can be unnamed but the rest must be uniquely named)</li> <li> with defined branch lengths (e.g.
 * (A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F:0.0;)</li> </ul>
 */
@Slf4j
public class NewickTreeParser {

    /**
     * Create the parser.
     * @param newickFile the name of the file containing the tree.
     */
    public NewickTreeParser(final String newickFile) {
        try (FileInput file = new FileInput(newickFile)) {
            newickString = file.read();
        } catch (IOException ex) {
            log.error("Could not read Newick file {}. {}", newickFile, ex.getLocalizedMessage());
        }
    }

    /**
     * Create the parser.
     */
    public NewickTreeParser() {
        newickString = null;
    }

    /**
     * Parse a given string that MUST be a valid Newick format.
     * @param newickStr the string to parse.
     * @return a Tree created from the parsed string.
     */
    public final Tree<PhyloNode, Edge<PhyloNode>> parse(final String newickStr) {
        branchLabel = 0;

        final Tree<PhyloNode, Edge<PhyloNode>> phyloTree = new Tree<>();
        // first remove the trailing ; if any
        String stringToParse = StringUtils.removeEnd(newickStr.trim(), ";").trim();

        if (stringToParse.charAt(0) == '(' && getClosingParenthesis(stringToParse) == stringToParse.length() - 1) {
            stringToParse = stringToParse.substring(1, stringToParse.length() - 1);
        }

        // extract the information for the root node, this will be after the last closing parenthesis iff there is
        // a colon (TODO is this right? e.g. (A,B,(C,D),E)F; )
        double distance = 0.0;
        String rootName = "";
        final int rparen = stringToParse.lastIndexOf(')');
        final int rcolon = stringToParse.lastIndexOf(':');
        if (rparen < rcolon) {
            distance = Double.parseDouble(stringToParse.substring(rcolon + 1));
            rootName = stringToParse.substring(rparen + 1, rcolon);
            stringToParse = stringToParse.substring(0, rcolon - rootName.length()).trim(); // adjust the string to exclude the distance
        }

        // create a new node for this root and add it to the tree.
        final PhyloNode root = new PhyloNode(rootName, distance);
        log.trace("Parsing Newick file: Adding root node {}", root);
        phyloTree.addVertex(root);

        // if we don't have a comma we are just have the root node
        if (stringToParse.indexOf(',') == -1) {
            return phyloTree;
        }

        if (stringToParse.charAt(0) == '(' && getClosingParenthesis(stringToParse) == stringToParse.length() - 1) {
            stringToParse = stringToParse.substring(1, stringToParse.length() - 1);
        }

        parseString(stringToParse, root, phyloTree);
        return phyloTree;
    }

    /**
     * Parse a string from a Newick file.
     * @return a generated Tree object.
     */
    public final Tree<PhyloNode, Edge<PhyloNode>> parse() {
        if (newickString == null) {
            throw new IllegalArgumentException("NewickParser created but no string to parse found. Did you create the parser without specifying a file?");
        }
        return parse(newickString);
    }

    /**
     * Parse a string in Newick format (where the root node has been removed) recursively.
     * @param stringToParse the string to be parsed.
     * @param parent        the parent node for the string (initially this is the root node but will be a brach node if
     *                      this method is called recursively).
     * @param tree          the tree to which the created nodes will be attached.
     */
    private void parseString(final String stringToParse, final PhyloNode parent,
                             final Tree<PhyloNode, Edge<PhyloNode>> tree) {
        for (String node : findNodes(stringToParse)) {
            parseNode(node, tree, parent);
        }
    }

    /**
     * Parse a tree splitting it into top level nodes and subtrees.
     * @param tree the tree tp be parsed.
     * @return a collection of nodes and subtrees contained in the parsed string
     */
    private Collection<String> findNodes(final String tree) {
        final Collection<String> nodes = new ArrayList<>();

        int start = 0;
        int depth = 0;
        boolean isOk = true;
        for (int i = 0; i < tree.length(); i++) {
            final char charAt = tree.charAt(i);
            if (charAt == '(') {
                depth++;
                isOk = false;
            }
            if (charAt == ')' && --depth == 0) {
                isOk = true;
            }

            if (charAt == ',' && isOk) {
                nodes.add(tree.substring(start, i).trim());
                start = i + 1;
            }
        }
        nodes.add(tree.substring(start).trim());

        return nodes;
    }

    /**
     * Get the index of the closing parenthesis in a string that starts with an opening parenthesis. If the string does
     * not start with an opening parenthesis then an IllegalArgumentException is thrown. The algorithm for finding the
     * closing parenthesis is simple - keep reading the string incrementing a count of the number of opening parentheses
     * we find and decrementing the count if we encounter a closing parenthesis. When we read a closing parenthesis AND
     * the count becomes zero we have our closing parenthesis.
     * @param strng the string to examine.
     * @return the index in the string of the closing parenthesis, -1 if none is found.
     */
    private int getClosingParenthesis(final String strng) {

        if (!strng.trim().startsWith("(")) {
            throw new IllegalArgumentException(String.format("Illegal Argument [%s] does not start with an opening parenthesis", strng));
        }

        int depth = 0;
        for (int i = 0; i < strng.length(); i++) {
            if (strng.charAt(i) == '(') {
                depth++;
            }
            if (strng.charAt(i) == ')' && (--depth == 0)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Parse a string containing information on a node in Newick format and attach it to a given tree.
     * @param node   the string containing the node information.
     * @param tree   the tree to which the created node will be attached.
     * @param parent the node on the tree to which the new node will be created.
     */
    private void parseNode(final String node, final Tree<PhyloNode, Edge<PhyloNode>> tree, final PhyloNode parent) {
        log.trace("parsing {} from parent node {} ", node, parent.getId());
        if (node.charAt(0) == '(') {
            // this is a branch so create a branch node and set the parent
            final int rparen = node.lastIndexOf(')');
            final PhyloNode branchNode = addNodeToTree(node.substring(rparen + 1), tree, parent, true);
            for (String nd : findNodes(node.substring(1, rparen))) {
                parseNode(nd, tree, branchNode);
            }
        } else if (node.indexOf(',') != -1) {
            for (String nd : findNodes(node)) {
                parseNode(nd, tree, parent);
            }
        } else {
            // this is a single node
            addNodeToTree(node, tree, parent, false);
        }
    }

    /**
     * Given a string that represents a node in the tree, split in into its constituent name and distance components and
     * add it to the tree. If the string is empty then a dummy node is added, this is useful fro creating (unnamed)
     * branches.
     * @param node             the string representing the node.
     * @param tree             the tree object on which the node will be placed.
     * @param parent           the parent node to which this node is attached.
     * @param createUniqueName if the name of the node is not unique then add create a unique one if this is true.
     * @return the node added to the tree.
     */
    private PhyloNode addNodeToTree(final String node, final Tree<PhyloNode, Edge<PhyloNode>> tree, final PhyloNode parent,
                                    final boolean createUniqueName) {
        log.trace("Adding {} to tree at {}.", node, parent);
        PhyloNode phyloNode;

        final int lcolon = node.indexOf(':');
        String nodeName;
        double distance = 0.0;

        if (node.isEmpty()) {
            // no details so use default values.
            nodeName = String.format("branch-%d", branchLabel++);
            distance = 0.0;
        } else if (lcolon == -1) {
            // we just have a node name and no distance.
            distance = 0.0;
            nodeName = node;
        } else {
            // we have a node name and a distance.
            distance = Double.parseDouble(node.substring(lcolon + 1));
            nodeName = node.substring(0, lcolon);
        }

        phyloNode = new PhyloNode(nodeName, distance);

        try {
            tree.addEdge(new Edge<>(parent, phyloNode, distance), parent, phyloNode);
        } catch (IllegalArgumentException e) {
            // we may have non-unique branch names in the data file, if we encounter one here and we want to rename
            // it we do it now. This is not very good as we are searching for an error message but it's the best as can
            // be done.
            if (e.getLocalizedMessage().contains("already exists in this graph") && createUniqueName) {
                nodeName = String.format("branch-%d", branchLabel++);
                phyloNode = new PhyloNode(nodeName, distance);
                tree.addEdge(new Edge<>(parent, phyloNode), parent, phyloNode);
            }
        }

        return phyloNode;
    }
    private String newickString;
    private int branchLabel = 0;
}