package uk.ac.cam.ch.wwmm.chemicaltagger;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.parser.Parse;
import org.apache.commons.cli.HelpFormatter;
import uk.ac.cam.ch.wwmm.oscar.document.Token;
import uk.ac.cam.ch.wwmm.oscar.tools.StringTools;

/* loaded from: input_file:uk/ac/cam/ch/wwmm/chemicaltagger/Formatter.class */
/**
 * Static helpers that normalise raw text and split tokens whose surface form
 * glues several pieces together (for example a number and its unit).
 *
 * <p>The class only has static members and cannot be instantiated.
 */
public class Formatter {
    /** Finds "sulph" in any letter case so that it can be rewritten as "sulf". */
    private static final Pattern MATCH_SULPH = Pattern.compile("sulph", Pattern.CASE_INSENSITIVE);

    /**
     * Finds a degree-like sign, then whitespace, then C or F followed by punctuation,
     * whitespace or the end of the input. Group 1 is the sign, group 2 the whitespace and
     * group 3 the unit letter together with the character that follows it.
     */
    private static final Pattern MATCH_DEGREES_WHITESPACE = Pattern.compile("([°º])(\\s+)([cCfF]([.,;:()\\[\\]{}\\s]|$))");

    /** Finds any run of whitespace. */
    private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");

    /**
     * Finds a single space followed by a parenthesised charge (such as 2+ or +2), a zero, or
     * one of the Roman numerals I to IX, ignoring case.
     */
    private static final Pattern MATCH_SPACE_CHARGE_OR_OXIDATION_SPECIFIER = Pattern.compile(" \\((\\d[+\\-]|[+\\-]\\d|0|I{1,3}|IV|VI{1,3}|IX)\\)", Pattern.CASE_INSENSITIVE);

    /** Lower-case element names; a specifier is only re-attached when it follows one of these. */
    private static final List<String> ELEMENTS = Arrays.asList("hydrogen", "lithium", "sodium", "natrium", "potassium", "kalium", "rubidium", "caesium", "cesium", "francium", "beryllium", "magnesium", "calcium", "strontium", "barium", "radium", "aluminium", "aluminum", "gallium", "indium", "thallium", "tin", "stannum", "lead", "plumbum", "bismuth", "polonium", "scandium", "titanium", "vanadium", "chromium", "manganese", "iron", "cobalt", "nickel", "copper", "zinc", "yttrium", "zirconium", "niobium", "molybdenum", "technetium", "ruthenium", "rhodium", "palladium", "silver", "cadmium", "lanthanum", "cerium", "praseodymium", "neodymium", "promethium", "samarium", "europium", "gadolinium", "terbium", "dysprosium", "holmium", "erbium", "thulium", "ytterbium", "lutetium", "hafnium", "tantalum", "tungsten", "wolfram", "rhenium", "osmium", "iridium", "platinum", "gold", "mercury", "hydrargyrum", "actinium", "thorium", "protactinium", "uranium", "neptunium", "plutonium", "americium", "curium", "berkelium", "californium", "einsteinium", "fermium", "mendelevium", "nobelium", "lawrencium", "rutherfordium", "boron", "carbon", "silicon", "germanium", "nitrogen", "phosphorus", "arsenic", "antimony", "stibium", "oxygen", "sulfur", "selenium", "tellurium", "polonium", "fluorine", "chlorine", "bromine", "iodine", "astatine", "helium", "neon", "argon", "krypton", "xenon", "radon");

    /** Matches a whole token that is a number immediately followed by a unit. */
    private static final Pattern CONCAT_AMOUNT_PATTERN = Pattern.compile("[~]?\\d*\\.?(\\d(\\d+|\\.\\d+|\\d*[mkµu])(g|l|hPa)[s]?|(\\d+[mnkµu]?([LMN]|[eE][qQ][\\.]?|[cCdD][mM]3|[gG][rR][aA][mM][mM]?[eE]?|[mM][oO][lL][eE]?|[mM][oO][lL][aA][rR])[sS]?))$");

    /** Finds "pH" at the start of a token, directly followed by an optionally negative integer. */
    private static final Pattern CONCAT_PH_PATTERN = Pattern.compile("^pH-?\\d+");

    /** Finds digits directly followed by a degree marker (group 1) and C or F. */
    private static final Pattern CONCAT_TEMP_PATTERN = Pattern.compile("\\d+(o|°|º)[cCfF][\\.]?");

    /** Finds one percent sign (group 2) with the percent-free text before (1) and after (3) it. */
    private static final Pattern CONCAT_PERCENTAGE_PATTERN = Pattern.compile("([^%]*)(%)([^%]*)");

    private Formatter() {
    }

    /**
     * Normalises a piece of text. The steps run in this order:
     * <ol>
     *   <li>dash-like characters are replaced by the ASCII hyphen-minus;</li>
     *   <li>Greek small mu is replaced by the micro sign;</li>
     *   <li>every "sulph" (any case) becomes lower-case "sulf";</li>
     *   <li>whitespace between a degree sign and C/F is moved in front of the degree sign;</li>
     *   <li>every run of whitespace becomes a single space;</li>
     *   <li>the space between an element name and a following parenthesised charge or
     *       oxidation state is removed.</li>
     * </ol>
     *
     * @param str the text to normalise; must not be {@code null}
     * @return the normalised text
     */
    public static String normaliseText(String str) {
        String text = str
                .replace("\u2010", "-")            // U+2010 hyphen
                .replace("\u2011", "-")            // U+2011 non-breaking hyphen
                .replace("\u2012", "-")            // U+2012 figure dash
                .replace(StringTools.enDash, "-")
                .replace(StringTools.emDash, "-")
                .replace("\u2015", "-")            // U+2015 horizontal bar
                .replace("\u2212", "-")            // U+2212 minus sign
                .replace("\u03BC", "\u00B5");      // U+03BC Greek mu -> U+00B5 micro sign
        text = MATCH_SULPH.matcher(text).replaceAll("sulf");
        // Emit whitespace (group 2) before the degree sign (group 1) so the sign touches the unit.
        text = MATCH_DEGREES_WHITESPACE.matcher(text).replaceAll("$2$1$3");
        text = WHITESPACE_PATTERN.matcher(text).replaceAll(" ");
        return removeSpaceBetweenElementsAndChargeOrOxidationStateSpecifier(text);
    }

    /**
     * Removes the single space in front of a parenthesised charge or oxidation-state specifier
     * when the text right before that space ends with an element name (compared without regard
     * to case). Specifiers that do not follow an element name are left as they are.
     *
     * @param str the text to process
     * @return the text with the qualifying spaces removed
     */
    private static String removeSpaceBetweenElementsAndChargeOrOxidationStateSpecifier(String str) {
        Matcher matcher = MATCH_SPACE_CHARGE_OR_OXIDATION_SPECIFIER.matcher(str);
        // StringBuffer is kept because Matcher.appendReplacement only accepts it before Java 9.
        StringBuffer result = new StringBuffer(str.length());
        while (matcher.find()) {
            if (endsWithElementName(str.substring(0, matcher.start()))) {
                // Drop the leading space of the match; quote so the text is copied literally.
                String specifier = matcher.group().substring(1);
                matcher.appendReplacement(result, Matcher.quoteReplacement(specifier));
            }
            // Otherwise nothing is appended here: the untouched text is copied by the next
            // appendReplacement call or by appendTail.
        }
        matcher.appendTail(result);
        return result.toString();
    }

    /** Tells whether {@code text} ends, ignoring case, with one of the known element names. */
    private static boolean endsWithElementName(String text) {
        for (String element : ELEMENTS) {
            if (StringTools.endsWithCaseInsensitive(text, element)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Replaces, in place, every token whose surface can be split by {@link #subTokenize(String)}
     * with its sub-tokens, then renumbers all tokens from zero via {@code setIndex}.
     *
     * <p>Sub-tokens are laid out back to back starting at the original token's start offset and
     * copy the original token's document, BIO type and named-entity element. A freshly created
     * sub-token is examined again, so splitting is applied repeatedly until nothing more splits.
     *
     * @param list the tokens to process; the list is modified and must support
     *             {@code remove} and {@code addAll}
     * @return the same list instance that was passed in
     */
    public static List<Token> subTokeniseTokens(List<Token> list) {
        int position = 0;
        while (position < list.size()) {
            Token token = list.get(position);
            String[] parts = subTokenize(token.getSurface());
            if (parts == null) {
                position++;
                continue;
            }
            List<Token> replacements = new ArrayList<Token>(parts.length);
            int partStart = token.getStart();
            for (String part : parts) {
                int partEnd = partStart + part.length();
                replacements.add(new Token(part, partStart, partEnd, token.getDoc(), token.getBioType(), token.getNeElem()));
                partStart = partEnd;
            }
            // The position is deliberately not advanced: the first replacement is re-checked next.
            list.remove(position);
            list.addAll(position, replacements);
        }
        int index = 0;
        for (Token token : list) {
            token.setIndex(index++);
        }
        return list;
    }

    /**
     * Splits a token surface into its parts, or reports that it should stay whole. The first
     * rule that applies wins: amount, pH value, temperature, percent sign, enclosing round
     * brackets.
     *
     * @param str the token surface
     * @return the parts in order, or {@code null} when the surface is not split
     */
    private static String[] subTokenize(String str) {
        if (str.length() <= 1) {
            return null;
        }
        if (CONCAT_AMOUNT_PATTERN.matcher(str).matches()) {
            return splitAmounts(str);
        }
        if (CONCAT_PH_PATTERN.matcher(str).find()) {
            // The pattern is anchored at the start, so the first two characters are "pH".
            return new String[]{str.substring(0, 2), str.substring(2)};
        }
        Matcher temperature = CONCAT_TEMP_PATTERN.matcher(str);
        if (temperature.find()) {
            // Cut right before the degree marker so it stays attached to the unit letter.
            int cut = temperature.start(1);
            return new String[]{str.substring(0, cut), str.substring(cut)};
        }
        if (str.contains("%")) {
            return splitPercentageSign(str);
        }
        if (str.length() > 2 && str.startsWith("(") && str.endsWith(")")) {
            int last = str.length() - 1;
            return new String[]{str.substring(0, 1), str.substring(1, last), str.substring(last)};
        }
        return null;
    }

    /**
     * Splits an amount in front of its first letter, giving the numeric part and the unit.
     * When the text has no letter, the second element is the empty string.
     *
     * @param str the amount text
     * @return a two-element array: the text before the first letter, and the rest
     */
    private static String[] splitAmounts(String str) {
        int cut = 0;
        while (cut < str.length() && !Character.isLetter(str.charAt(cut))) {
            cut++;
        }
        return new String[]{str.substring(0, cut), str.substring(cut)};
    }

    /**
     * Splits text around each percent sign. Every percent sign becomes its own part and empty
     * parts are dropped. Text that follows a percent sign and starts with a hyphen-minus plus
     * at least one more character has that hyphen-minus split off as a separate part.
     *
     * @param str the text, expected to contain at least one percent sign
     * @return the parts in their original order
     */
    private static String[] splitPercentageSign(String str) {
        List<String> parts = new ArrayList<String>();
        Matcher matcher = CONCAT_PERCENTAGE_PATTERN.matcher(str);
        while (matcher.find()) {
            String before = matcher.group(1);
            if (before.length() > 0) {
                parts.add(before);
            }
            parts.add(matcher.group(2));
            String after = matcher.group(3);
            if (after.length() == 0) {
                continue;
            }
            if (after.startsWith("-") && after.length() > 1) {
                parts.add("-");
                parts.add(after.substring(1));
            } else {
                parts.add(after);
            }
        }
        return parts.toArray(new String[parts.size()]);
    }
}
