package edu.northwestern.at.morphadorner.corpuslinguistics.tokenizer;

import edu.northwestern.at.utils.PatternReplacer;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/northwestern/at/morphadorner/corpuslinguistics/tokenizer/EccoPreTokenizer.class */
/**
 * Pre-tokenizer that layers extra spacing rules on top of {@link AbstractPreTokenizer}.
 *
 * <p>On construction it installs its own "always separators" pattern into the inherited
 * {@code alwaysSeparatorsReplacer}. {@link #pretokenize(String)} then post-processes the
 * superclass result: it pads gap markers, inserts spaces next to em dashes in selected
 * contexts, and pads backticks.
 *
 * <p>NOTE(review): "Ecco" presumably refers to the ECCO text collection; nothing in this
 * file confirms that.
 */
public class EccoPreTokenizer extends AbstractPreTokenizer implements PreTokenizer {
    /**
     * Regular expression for text that is always surrounded by spaces: runs of two or more
     * hyphens, runs of three or more periods, and any single character from the bracketed
     * class (assorted punctuation plus several Unicode blocks, minus the listed exclusions).
     */
    protected static final String EccoAlwaysSeparators = "((-{2,})|(\\.{3,})|[\\(\\)\\[\\]\";:/=`¶<>¡¿«»“”¦❘[\\p{InGeneralPunctuation}&&[^\\{\\}\\|•′″‴‘’‐‑…⁂†‡§]]\\p{InLetterlikeSymbols}\\p{InMathematicalOperators}\\p{InMiscellaneousTechnical}[\\p{InGeometricShapes}&&[^●◊]]\\p{InMiscellaneousSymbols}\\p{InDingbats}\\p{InAlphabeticPresentationForms}])";

    /** Pads a gap marker of the form 〈…〉 (containing only ◊, | or … characters) with spaces. */
    protected static final PatternReplacer wordOrSpanGapReplacer = new PatternReplacer("(〈[◊|…]+〉)", " $1 ");

    /** Pads a pair of consecutive backticks with a space on each side. */
    protected static final PatternReplacer doubleBackTicksReplacer = new PatternReplacer("(``)", " $1 ");

    /** Inserts a space between a backtick and an immediately following letter A-Z. */
    protected static final PatternReplacer singleBackTicksReplacer = new PatternReplacer("`([A-Z])", "` $1");

    // The em dash patterns below are compiled once here instead of on every
    // pretokenize() call (String.replaceAll recompiles its regex each time).

    /** Em dash directly after whitespace, a period, a question mark or an exclamation mark. */
    private static final Pattern dashAfterBreakPattern = Pattern.compile("(\\s|\\.|\\?|!)—");

    /** Em dash directly after a run of three or more letters, digits, hyphens, apostrophes, ‑ or ●. */
    private static final Pattern dashAfterWordPattern = Pattern.compile("([\\p{L}\\-0-9\\'‑●]{3,})—");

    /** Em dash directly before a run of three or more letters, digits, hyphens, apostrophes, ‑ or ●. */
    private static final Pattern dashBeforeWordPattern = Pattern.compile("—([\\p{L}\\-0-9\\'‑●]{3,})");

    /** Em dash directly after a run of one or more ASCII digits. */
    private static final Pattern dashAfterDigitsPattern = Pattern.compile("([0-9]+)—");

    /**
     * Creates the pre-tokenizer and replaces the inherited always-separators replacer with
     * one built from {@link #EccoAlwaysSeparators}; each match is wrapped in spaces.
     */
    public EccoPreTokenizer() {
        alwaysSeparatorsReplacer = new PatternReplacer(EccoAlwaysSeparators, " $1 ");
    }

    /**
     * Prepares a string for tokenization.
     *
     * <p>The superclass pre-tokenization runs first; the steps below are then applied to its
     * result in a fixed order, each one operating on the output of the previous step.
     *
     * @param str the text to pre-tokenize
     * @return the text with the additional spaces inserted
     */
    @Override
    public String pretokenize(String str) {
        String text = super.pretokenize(str);

        // Set gap markers apart from the surrounding text.
        text = wordOrSpanGapReplacer.replace(text);

        // Separate em dashes from neighbouring text. Order matters: later rules see the
        // spaces inserted by earlier ones.
        text = dashAfterBreakPattern.matcher(text).replaceAll("$1 —");
        text = dashAfterWordPattern.matcher(text).replaceAll("$1 —");
        text = dashBeforeWordPattern.matcher(text).replaceAll("— $1");
        text = dashAfterDigitsPattern.matcher(text).replaceAll("$1 —");

        // Double backticks are handled before single ones, as in the original call chain.
        text = doubleBackTicksReplacer.replace(text);
        return singleBackTicksReplacer.replace(text);
    }
}
