package edu.harvard.wcfia.yoshikonverter;

import edu.harvard.wcfia.yoshikonverter.util.FileUtil;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.regex.Pattern;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/* loaded from: input_file:edu/harvard/wcfia/yoshikonverter/Converter.class */
public class Converter {
    /**
     * Shared PDF text extractor used by {@code inhalePdf}. It is created in the static
     * initializer; if construction fails there the exception is only printed, so this
     * field can still be {@code null}.
     */
    private static PDFTextStripper stripper;
    /** Class name passed to {@code XMLReaderFactory.createXMLReader} when the HTML parser is created. */
    private static String parserName;
    /** HTML parser; created on the first call to {@code inhaleHtml} and reused by later calls. */
    private static XMLReader parser;
    /** Matches a run of one or more consecutive CR LF pairs. */
    private static Pattern windowsLinebreaks;
    /** Matches a run of two or more consecutive tab characters. */
    private static Pattern tabs;

    /* loaded from: input_file:edu/harvard/wcfia/yoshikonverter/Converter$RemoveMarkupHandler.class */
    /**
     * SAX handler that accumulates the character data of a document and drops the markup.
     * Character data seen between the start and end of a {@code script} or {@code style}
     * element is skipped. Every chunk of kept text is followed by a single space so that
     * text from adjacent elements does not run together.
     */
    static class RemoveMarkupHandler extends DefaultHandler {
        private final StringBuilder sb = new StringBuilder();
        private boolean ignoreScript = false;
        private boolean ignoreStyle = false;

        RemoveMarkupHandler() {
        }

        /**
         * Appends the reported characters plus a trailing space, unless the parser is
         * currently inside a script or style element.
         *
         * @param ch     buffer holding the characters
         * @param start  index of the first character to read
         * @param length number of characters to read
         */
        @Override
        public void characters(char[] ch, int start, int length) {
            if (this.ignoreScript || this.ignoreStyle) {
                return;
            }
            // Append straight from the buffer; no temporary String is needed.
            this.sb.append(ch, start, length).append(' ');
        }

        /**
         * Prints the stack trace of a recoverable parse error and does not rethrow it.
         *
         * @param exception the error reported by the parser
         */
        @Override
        public void error(SAXParseException exception) throws SAXException {
            exception.printStackTrace();
        }

        /**
         * @return all text collected so far
         */
        public String getContent() {
            return this.sb.toString();
        }

        /**
         * Starts suppressing character data when a script or style element opens.
         * The name comparison is case-insensitive and independent of the default locale:
         * a locale-sensitive {@code toLowerCase()} turns "SCRIPT" into "scr\u0131pt" under
         * a Turkish locale and would let script text through.
         */
        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes)
                throws SAXException {
            if ("script".equalsIgnoreCase(localName)) {
                this.ignoreScript = true;
            } else if ("style".equalsIgnoreCase(localName)) {
                this.ignoreStyle = true;
            }
            super.startElement(uri, localName, qName, attributes);
        }

        /**
         * Stops suppressing character data when a script or style element closes.
         * Uses the same locale-independent name comparison as {@code startElement}.
         */
        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if ("script".equalsIgnoreCase(localName)) {
                this.ignoreScript = false;
            } else if ("style".equalsIgnoreCase(localName)) {
                this.ignoreStyle = false;
            }
            super.endElement(uri, localName, qName);
        }
    }

    /**
     * Extracts the text content of an HTML file.
     * <p>
     * The file is parsed with the shared SAX parser (created on first use from
     * {@code parserName}) and a {@code RemoveMarkupHandler}. The collected text is then
     * normalized with {@code compactWindowsLinebreaks} and {@code compactSpacesAndTabs},
     * split into lines on CR, LF or form feed, and each line is trimmed. Non-blank lines
     * are written out followed by a newline. Blank lines are dropped, except that the
     * third consecutive blank line of a run produces one empty output line.
     *
     * @param file the HTML file to read
     * @param str  name of the character encoding of the file; {@code null} selects the
     *             platform default, as {@code inhale} does
     * @return the extracted text
     * @throws IOException  if the file cannot be read or the encoding is not supported
     * @throws SAXException if the parser cannot be created or parsing fails
     */
    public static String inhaleHtml(File file, String str) throws IOException, SAXException {
        if (parser == null) {
            parser = XMLReaderFactory.createXMLReader(parserName);
        }
        RemoveMarkupHandler removeMarkupHandler = new RemoveMarkupHandler();
        FileInputStream in = new FileInputStream(file);
        try {
            InputStreamReader reader =
                    str == null ? new InputStreamReader(in) : new InputStreamReader(in, str);
            parser.setContentHandler(removeMarkupHandler);
            parser.parse(new InputSource(reader));
        } finally {
            // Always release the file handle, including when the encoding is rejected or parsing fails.
            in.close();
        }
        String normalized =
                compactSpacesAndTabs(compactWindowsLinebreaks(removeMarkupHandler.getContent()));
        StringBuilder out = new StringBuilder();
        int blankRun = 0; // consecutive blank lines seen so far
        for (String line : normalized.split("[\r\n\f]")) {
            String trimmed = line.trim();
            if (trimmed.length() == 0) {
                // Only the third blank line in a row yields an empty output line.
                if (blankRun == 2) {
                    out.append("\n");
                }
                blankRun++;
            } else {
                out.append(trimmed);
                out.append("\n");
                blankRun = 0;
            }
        }
        return out.toString();
    }

    /**
     * Replaces every match of the {@code tabs} pattern (a run of two or more tab
     * characters) with four spaces. Single tabs and ordinary spaces are left untouched.
     *
     * @param str the text to process
     * @return the text with each tab run replaced
     */
    public static String compactSpacesAndTabs(String str) {
        final String fourSpaces = "    ";
        return tabs.matcher(str).replaceAll(fourSpaces);
    }

    /**
     * Replaces every match of the {@code windowsLinebreaks} pattern (a run of one or
     * more CR LF pairs) with a space followed by a single line feed.
     *
     * @param str the text to process
     * @return the text with each CR LF run collapsed
     */
    public static String compactWindowsLinebreaks(String str) {
        final String spaceThenNewline = " \n";
        return windowsLinebreaks.matcher(str).replaceAll(spaceThenNewline);
    }

    /**
     * Returns the input with every {@code <!-- ... -->} comment removed.
     * If a comment is opened but never closed, everything from its opening
     * {@code <!--} to the end of the input is dropped.
     *
     * @param str the text to strip
     * @return the text without comments
     */
    private static String removeXmlComments(String str) {
        StringBuilder kept = new StringBuilder();
        int cursor = 0;
        int open = str.indexOf("<!--", cursor);
        while (open != -1) {
            // Keep the text between the previous comment (or the start) and this one.
            kept.append(str, cursor, open);
            int close = str.indexOf("-->", open + 4);
            if (close == -1) {
                // Unterminated comment: the remainder is discarded.
                return kept.toString();
            }
            cursor = close + 3;
            open = str.indexOf("<!--", cursor);
        }
        kept.append(str, cursor, str.length());
        return kept.toString();
    }

    /**
     * Opens the file as an {@code HWPFDocument} and returns the text of its range.
     * The input stream is closed whether or not extraction succeeds.
     *
     * @param file the document to read
     * @return the text of the document's range
     * @throws IOException if the file cannot be opened, read or closed
     */
    public static String inhaleMSFormat(File file) throws IOException {
        FileInputStream wordStream = new FileInputStream(file);
        try {
            HWPFDocument document = new HWPFDocument(wordStream);
            return document.getRange().text();
        } finally {
            wordStream.close();
        }
    }

    /**
     * Extracts the text of a PDF file and rejoins words hyphenated across line breaks.
     * <p>
     * The document is always closed, including when it turns out to be encrypted or
     * text extraction fails. After extraction, a word fragment followed by a hyphen,
     * optional spaces, a line feed, optional spaces and another word fragment is joined
     * into one word ("han- \ndy" becomes "handy").
     *
     * @param file the PDF file to read
     * @return the extracted text with line-break hyphenation removed
     * @throws IOException if the file cannot be loaded, is encrypted, cannot be
     *                     processed, or the shared text stripper could not be created
     */
    public static String inhalePdf(File file) throws IOException {
        PDDocument document = PDDocument.load(file);
        String text;
        try {
            if (document.isEncrypted()) {
                throw new IOException(file.getName() + " is encrypted");
            }
            if (stripper == null) {
                // The static initializer only prints a construction failure, so the
                // stripper can be missing; report that instead of a NullPointerException.
                throw new IOException("PDF text stripper is not available");
            }
            text = stripper.getText(document);
        } finally {
            document.close();
        }
        return text.replaceAll("(\\w+)-[ ]*\\n[ ]*(\\w+)", "$1$2");
    }

    /**
     * Reads the whole file through {@code FileUtil.getBytes} and decodes it to a string.
     *
     * @param file the file to read
     * @param str  name of the character encoding to decode with; when {@code null} the
     *             bytes are decoded with the platform default charset
     * @return the decoded file content
     * @throws IOException if reading fails or the named encoding is not supported
     */
    public static String inhale(File file, String str) throws IOException {
        byte[] raw = FileUtil.getBytes(file);
        if (str != null) {
            return new String(raw, str);
        }
        return new String(raw);
    }

    /**
     * Manual smoke test: prints the result of the de-hyphenation regex on one sample,
     * then the result of {@code removeXmlComments} on three samples, each wrapped in
     * square brackets.
     *
     * @param strArr ignored
     */
    public static void main(String[] strArr) {
        String hyphenated = "Advert-\n  ising and other han- \ndy things, but -\n and this";
        System.out.println(hyphenated.replaceAll("(\\w+)-[ ]*\\n[ ]*(\\w+)", "$1$2"));

        String[] commentSamples = {
            "<!-- foo -->and bar<!--comment",
            "and bar <!-- comment --><!--",
            "and <!-- other -->bar<!--- -->",
        };
        try {
            for (String sample : commentSamples) {
                System.out.println("[" + removeXmlComments(sample) + "]");
            }
        } catch (Exception failure) {
            failure.printStackTrace();
        }
    }

    static {
        try {
            stripper = new PDFTextStripper();
        } catch (Exception e) {
            e.printStackTrace();
        }
        parserName = "org.ccil.cowan.tagsoup.Parser";
        windowsLinebreaks = Pattern.compile("(\r\n)+");
        tabs = Pattern.compile("[\t]{2,}+");
    }
}
