package edu.byu.deg.framework.dsp.vsm;

import com.hp.hpl.jena.sparql.sse.Tags;
import edu.byu.deg.framework.Document;
import edu.byu.deg.framework.DocumentStructureParser;
import edu.byu.deg.framework.OntologySubscriber;
import edu.byu.deg.framework.UnrecognizedFormatException;
import edu.byu.deg.framework.document.DOMDocument;
import edu.byu.deg.framework.dsp.heuristic.CombinedHeuristic;
import edu.byu.deg.osmxwrappers.OSMXDocument;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.tidy.Tidy;

/* loaded from: input_file:edu/byu/deg/framework/dsp/vsm/VSMRecordSeparator.class */
/**
 * {@link DocumentStructureParser} that breaks a {@link DOMDocument} into sub-documents
 * ("records").
 *
 * <p>The decisions about which nodes can be split, merged, or contain a record are all delegated
 * to a {@link VSMManager} built from the current ontology. This class only drives the DOM
 * surgery: it splits a node's children into candidate pieces, merges adjacent pieces the manager
 * says belong together, and registers the resulting nodes as sub-documents.
 *
 * <p>An ontology must be supplied (via the one-argument constructor or
 * {@link #setOntology(OSMXDocument)}) before {@link #parse(Document)} is called.
 */
public class VSMRecordSeparator implements DocumentStructureParser, OntologySubscriber {
    /** Tag name of the synthetic wrapper elements this class inserts around grouped nodes. */
    public static final String SEPARATOR_TAG = "record";

    /**
     * Fraction of a node's pieces that must be unsplittable record holders before all pieces are
     * kept as records (see {@code getRecords}).
     */
    public static final double CHILD_RECORD_THRESHOLD = 0.66d;

    /** Minimum number of pieces a node must have before the keep-all rule is considered. */
    public static final double MIN_CHILDREN_TO_KEEP_ALL = 4.0d;

    /** Shared Tidy instance, lazily created by the first constructor call. Not used in this class. */
    static Tidy tidy = null;

    private VSMManager manager;
    private OSMXDocument ontDoc;

    /**
     * Creates a separator bound to the given ontology.
     *
     * @param oSMXDocument ontology handed to {@link #setOntology(OSMXDocument)}
     */
    public VSMRecordSeparator(OSMXDocument oSMXDocument) {
        this();
        setOntology(oSMXDocument);
    }

    /**
     * Creates a separator with no ontology; {@link #setOntology(OSMXDocument)} must be called
     * before parsing. Also initialises the shared {@code tidy} instance on first use.
     */
    public VSMRecordSeparator() {
        if (tidy == null) {
            tidy = new Tidy();
            tidy.setShowWarnings(false);
            tidy.setQuiet(true);
            // NOTE(review): 3 is presumably JTidy's UTF-8 encoding constant - confirm.
            tidy.setCharEncoding(3);
        }
    }

    /**
     * Splits the given document into sub-documents.
     *
     * <p>The document's confidence is set from {@code manager.getCos(root)}. If the manager
     * reports the document as splittable, the root is split recursively; otherwise the root
     * itself becomes the only sub-document added by this call.
     *
     * @param document document to split; must be a {@link DOMDocument}
     * @return the same document instance, with sub-documents registered on it
     * @throws UnrecognizedFormatException if {@code document} is not a {@link DOMDocument}
     * @throws IllegalStateException if no ontology has been set yet
     */
    @Override
    public Document parse(Document document) throws UnrecognizedFormatException {
        if (!(document instanceof DOMDocument)) {
            throw new UnrecognizedFormatException("RecordSeparator cannot handle this document");
        }
        if (this.manager == null) {
            // Previously surfaced as a bare NullPointerException a few lines further down.
            throw new IllegalStateException(
                    "No ontology has been set; call setOntology(...) before parse(...)");
        }
        DOMDocument domDocument = (DOMDocument) document;
        this.manager.calculateVSM(domDocument.getDOMString(), domDocument.getRoot());
        Node root = domDocument.getRoot();
        domDocument.setConfidence(this.manager.getCos(root));
        if (this.manager.isDocumentSplitable(root)) {
            return parse(root, domDocument);
        }
        domDocument.addSubDocument(root);
        return domDocument;
    }

    /**
     * Merges adjacent pieces that the manager allows to be merged.
     *
     * <p>Two neighbours are merged only when neither is splittable and
     * {@code manager.canMerge} accepts the pair. A merge wraps both nodes in a new
     * {@link #SEPARATOR_TAG} element placed where the first node was, and replaces the two list
     * entries with that wrapper. The wrapper is then compared against the next piece, so runs of
     * mergeable nodes end up nested inside one outer wrapper.
     *
     * @param parent      DOM parent of every node in {@code pieces}
     * @param pieces      ordered pieces of {@code parent}; modified in place
     * @param domDocument document whose parent DOM document creates the wrapper elements
     * @return the wrapper elements created by this call
     */
    private Set<Node> recombine(Node parent, List<Node> pieces, DOMDocument domDocument) {
        Set<Node> wrappers = new HashSet<>();
        if (pieces.isEmpty()) {
            return wrappers;
        }
        Node current = pieces.get(0);
        int index = 1;
        while (index < pieces.size()) {
            Node next = pieces.get(index);
            boolean keepApart = this.manager.isSplitable(current)
                    || this.manager.isSplitable(next)
                    || !this.manager.canMerge(current, next);
            if (keepApart) {
                current = next;
                index++;
                continue;
            }
            Element wrapper = domDocument.getParentDocument().createElement(SEPARATOR_TAG);
            wrappers.add(wrapper);
            try {
                parent.insertBefore(wrapper, current);
                parent.removeChild(current);
                parent.removeChild(next);
                wrapper.appendChild(current);
                wrapper.appendChild(next);
            } catch (Exception e) {
                // Best effort: report the failure and carry on with the remaining pieces.
                e.printStackTrace();
            }
            // Collapse the two entries into the wrapper. The index is deliberately not advanced
            // so the wrapper is compared with the piece that now follows it.
            pieces.set(index - 1, wrapper);
            pieces.remove(index);
            current = wrapper;
        }
        return wrappers;
    }

    /**
     * Sorts candidate pieces into those that contain a record and those that do not, and decides
     * whether every piece should be kept as a record.
     *
     * <p>The keep-all rule applies when there are at least {@link #MIN_CHILDREN_TO_KEEP_ALL}
     * candidates and more than {@link #CHILD_RECORD_THRESHOLD} of them contain a record without
     * being splittable. In that case {@code records} is replaced by all candidates, minus any
     * leading and trailing candidates that do not contain a record.
     *
     * @param candidates pieces to classify; not modified
     * @param records    output list receiving the pieces to treat as records
     * @param nonRecords output list receiving the pieces that contain no record
     * @return {@code true} if the keep-all rule was applied
     */
    private boolean getRecords(List<Node> candidates, List<Node> records, List<Node> nonRecords) {
        int unsplittableRecordCount = 0;
        for (Node candidate : candidates) {
            if (this.manager.containsRecord(candidate)) {
                if (!this.manager.isSplitable(candidate)) {
                    unsplittableRecordCount++;
                }
                records.add(candidate);
            } else {
                nonRecords.add(candidate);
            }
        }
        if (candidates.size() < MIN_CHILDREN_TO_KEEP_ALL
                || unsplittableRecordCount <= candidates.size() * CHILD_RECORD_THRESHOLD) {
            return false;
        }
        System.out.println("using all");
        records.clear();
        records.addAll(candidates);
        // Trim non-record pieces from both ends. The emptiness guards are defensive: the
        // threshold above implies at least one candidate contains a record.
        while (!records.isEmpty() && !this.manager.containsRecord(records.get(0))) {
            records.remove(0);
        }
        while (!records.isEmpty()
                && !this.manager.containsRecord(records.get(records.size() - 1))) {
            records.remove(records.size() - 1);
        }
        return true;
    }

    /**
     * Recursively splits {@code node} and registers the resulting sub-documents.
     *
     * <p>A node the manager does not consider splittable is registered as is. Otherwise its
     * children are split and recombined; each resulting record piece is either registered
     * directly (keep-all rule, or the piece is a freshly merged wrapper) or split further. If the
     * whole subtree produced no sub-document, or exactly one, {@code node} itself is used as the
     * sub-document instead.
     *
     * @param node        subtree to process
     * @param domDocument document receiving the sub-documents
     * @return {@code domDocument}
     */
    private Document parse(Node node, DOMDocument domDocument) {
        if (!this.manager.isSplitable(node)) {
            domDocument.addSubDocument(node);
            return domDocument;
        }
        List<Node> pieces = split(node, domDocument.getParentDocument());
        // NOTE(review): the pieces are children of `node`, so its next sibling does not look like
        // it can ever be among them; this branch appears unreachable. Kept unchanged - confirm
        // the original intent before removing.
        if (pieces.contains(node.getNextSibling())) {
            domDocument.addSubDocument(node);
            return domDocument;
        }
        Set<Node> mergedWrappers = recombine(node, pieces, domDocument);
        List<Node> nonRecords = new ArrayList<>();
        List<Node> records = new ArrayList<>();
        boolean keepAll = getRecords(pieces, records, nonRecords);
        int countBefore = domDocument.getSubDocumentCount();
        for (Node record : records) {
            if (keepAll || mergedWrappers.contains(record)) {
                domDocument.addSubDocument(record);
            } else {
                parse(record, domDocument);
            }
        }
        int added = domDocument.getSubDocumentCount() - countBefore;
        if (added == 0) {
            domDocument.addSubDocument(node);
        }
        if (added == 1) {
            // A single sub-document is replaced by the node it was found in.
            domDocument.setSubDocument(domDocument.getSubDocumentCount() - 1, node);
        }
        return domDocument;
    }

    /**
     * Splits the children of {@code node} into candidate pieces.
     *
     * <p>If {@link CombinedHeuristic} proposes a separator tag, the children are grouped on that
     * tag; otherwise each direct child is returned as its own piece and the DOM is left
     * untouched.
     *
     * @param node     node whose children are split
     * @param document DOM document used to create wrapper elements
     * @return the pieces, in document order
     */
    private static List<Node> split(Node node, org.w3c.dom.Document document) {
        String separator = new CombinedHeuristic(node).getSeparator();
        if (separator != null && !separator.isEmpty()) {
            return split(node, separator, document);
        }
        List<Node> children = new ArrayList<>();
        for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
            children.add(child);
        }
        return children;
    }

    /**
     * Groups the children of {@code node} into {@link #SEPARATOR_TAG} wrapper elements, starting
     * a new wrapper at every child (other than the first) whose lower-cased node name equals
     * {@code separatorTag}. All children are moved into the wrappers, and the wrappers are then
     * appended back to {@code node}.
     *
     * @param node         node whose children are regrouped; its child list is rewritten
     * @param separatorTag tag name that starts a new group; compared against lower-cased node
     *                     names, so it only matches if it is itself lower case
     * @param document     DOM document used to create the wrapper elements
     * @return the wrapper elements, in order; empty if {@code node} has no children
     */
    private static List<Node> split(Node node, String separatorTag, org.w3c.dom.Document document) {
        System.out.println("Splitting on <" + separatorTag + Tags.symGT);
        List<Node> wrappers = new ArrayList<>();
        Element currentWrapper = document.createElement(SEPARATOR_TAG);
        boolean sawChild = false;
        Node child = node.getFirstChild();
        while (child != null) {
            // Locale.ROOT keeps tag matching independent of the JVM default locale (under a
            // Turkish locale "LI".toLowerCase() would not yield "li").
            if (sawChild && child.getNodeName().toLowerCase(Locale.ROOT).equals(separatorTag)) {
                wrappers.add(currentWrapper);
                currentWrapper = document.createElement(SEPARATOR_TAG);
            }
            // Read the sibling before moving the child, since moving it detaches it.
            Node following = child.getNextSibling();
            node.removeChild(child);
            currentWrapper.appendChild(child);
            child = following;
            sawChild = true;
        }
        if (sawChild) {
            wrappers.add(currentWrapper);
        }
        for (Node wrapper : wrappers) {
            try {
                node.appendChild(wrapper);
            } catch (RuntimeException e) {
                // Still best effort, but no longer silent: a wrapper that cannot be re-attached
                // means its content is missing from the tree.
                e.printStackTrace();
            }
        }
        return wrappers;
    }

    /**
     * Returns the ontology last passed to {@link #setOntology(OSMXDocument)}.
     *
     * @return the current ontology, or {@code null} if none has been set
     */
    @Override
    public OSMXDocument getOntology() {
        return this.ontDoc;
    }

    /**
     * Stores the ontology and builds a fresh {@link VSMManager} for it.
     *
     * @param oSMXDocument ontology to use for subsequent parsing
     */
    @Override
    public void setOntology(OSMXDocument oSMXDocument) {
        this.ontDoc = oSMXDocument;
        this.manager = new VSMManager(this.ontDoc);
    }
}
