diff --git a/charsetter/build.gradle b/charsetter/build.gradle
new file mode 100644
index 00000000..572dab71
--- /dev/null
+++ b/charsetter/build.gradle
@@ -0,0 +1,14 @@
+plugins {
+    id 'java'
+    id "org.moditect.gradleplugin" version "1.0.0-rc3"
+}
+
+apply from: "$rootDir/deps/java.gradle"
+apply from: "$rootDir/deps/commons.gradle"
+apply from: "$rootDir/deps/junit.gradle"
+apply from: "$rootDir/deps/lombok.gradle"
+
+configurations {
+    compileOnly.extendsFrom(dep)
+}
+
diff --git a/charsetter/src/main/java/io/xpipe/charsetter/Charsettable.java b/charsetter/src/main/java/io/xpipe/charsetter/Charsettable.java
new file mode 100644
index 00000000..42da3d4d
--- /dev/null
+++ b/charsetter/src/main/java/io/xpipe/charsetter/Charsettable.java
@@ -0,0 +1,12 @@
+package io.xpipe.charsetter;
+
+import java.nio.charset.Charset;
+
+/**
+ * Implemented by types that expose a charset.
+ */
+public interface Charsettable {
+
+    /** Returns the charset exposed by this object. */
+    Charset getCharset();
+}
diff --git a/charsetter/src/main/java/io/xpipe/charsetter/Charsetter.java b/charsetter/src/main/java/io/xpipe/charsetter/Charsetter.java
new file mode 100644
index 00000000..65018159
--- /dev/null
+++ b/charsetter/src/main/java/io/xpipe/charsetter/Charsetter.java
@@ -0,0 +1,108 @@
+package io.xpipe.charsetter;
+
+import org.apache.commons.io.ByteOrderMark;
+import org.apache.commons.io.input.BOMInputStream;
+import org.apache.commons.lang3.function.FailableBiConsumer;
+import org.apache.commons.lang3.function.FailableSupplier;
+
+import java.io.BufferedInputStream;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.*;
+
+/**
+ * Determines the charset of byte content, either from a byte order mark or by
+ * trial decoding against the candidate charsets of the configured universe.
+ * {@link #init(CharsetterContext)} must be called before any other method.
+ */
+public class Charsetter {
+
+    private static CharsetterUniverse universe;
+    private static final int MAX_BYTES = 8192;
+
+    /**
+     * Creates the universe of candidate charsets from the given context.
+     *
+     * @param ctx the context passed to {@link CharsetterUniverse#create(CharsetterContext)}
+     */
+    public static void init(CharsetterContext ctx) {
+        universe = CharsetterUniverse.create(ctx);
+    }
+
+    private static void checkInit() {
+        if (universe == null) {
+            throw new IllegalStateException("Charsetter not initialized");
+        }
+    }
+
+    /**
+     * Opens a stream, determines its charset and optionally passes both to a consumer.
+     * If a byte order mark is found, it decides the charset. Otherwise up to
+     * {@code MAX_BYTES} bytes are sampled and handed to {@link #inferCharset(byte[])}.
+     * The stream is closed when this method returns.
+     *
+     * @param in supplies the stream to inspect
+     * @param con receives the stream, with the sampled bytes still unread, and the
+     *            determined charset; may be {@code null}
+     * @return the determined charset
+     * @throws Exception if opening or reading the stream fails or the consumer throws
+     */
+    public static Charset read(
+            FailableSupplier<InputStream, Exception> in,
+            FailableBiConsumer<InputStream, Charset, Exception> con) throws Exception {
+        checkInit();
+
+        try (var is = in.get();
+             // BOMInputStream forwards mark/reset to the wrapped stream, so buffer
+             // streams that do not support marking; otherwise reset() would fail.
+             var bin = new BOMInputStream(is.markSupported() ? is : new BufferedInputStream(is))) {
+            ByteOrderMark bom = bin.getBOM();
+            String charsetName = bom == null ? null : bom.getCharsetName();
+            var charset = charsetName != null ? Charset.forName(charsetName) : null;
+
+            if (charset == null) {
+                bin.mark(MAX_BYTES);
+                var bytes = bin.readNBytes(MAX_BYTES);
+                bin.reset();
+                charset = inferCharset(bytes);
+            }
+
+            if (con != null) {
+                con.accept(bin, charset);
+            }
+            return charset;
+        }
+    }
+
+    /**
+     * Returns the first candidate charset that decodes the given bytes without
+     * reporting malformed or unmappable input.
+     *
+     * @param content the bytes to examine
+     * @return the first matching candidate, or UTF-8 if no candidate matches
+     */
+    public static Charset inferCharset(byte[] content) {
+        checkInit();
+
+        for (Charset c : universe.getCharsets()) {
+            CharsetDecoder decoder = c.newDecoder();
+            decoder.onMalformedInput(CodingErrorAction.REPORT);
+            decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+
+            ByteBuffer byteBuf = ByteBuffer.wrap(content);
+            CharBuffer charBuf = CharBuffer.allocate(byteBuf.capacity() * 2);
+
+            // With endOfInput = false, a multi-byte sequence that is cut off at the end
+            // of the sample yields an underflow instead of a malformed-input error.
+            CoderResult coderResult = decoder.decode(byteBuf, charBuf, false);
+            if (coderResult.isError()) {
+                continue;
+            }
+
+            return c;
+        }
+
+        return StandardCharsets.UTF_8;
+    }
+}
diff --git a/charsetter/src/main/java/io/xpipe/charsetter/CharsetterContext.java b/charsetter/src/main/java/io/xpipe/charsetter/CharsetterContext.java
new file mode 100644
index 00000000..0d6f7068
--- /dev/null
+++ b/charsetter/src/main/java/io/xpipe/charsetter/CharsetterContext.java
@@ -0,0 +1,23 @@
+package io.xpipe.charsetter;
+
+import lombok.AllArgsConstructor;
+import lombok.Value;
+
+import java.util.List;
+import java.util.Locale;
+
+/**
+ * Environment information from which the candidate charsets are derived.
+ */
+@Value
+@AllArgsConstructor
+public class CharsetterContext {
+
+    String systemCharsetName;
+
+    Locale systemLocale;
+
+    Locale appLocale;
+
+    List<String> observedCharsets;
+}
diff --git a/charsetter/src/main/java/io/xpipe/charsetter/CharsetterUniverse.java b/charsetter/src/main/java/io/xpipe/charsetter/CharsetterUniverse.java
new file mode 100644
index 00000000..471c0b4e
--- /dev/null
+++ b/charsetter/src/main/java/io/xpipe/charsetter/CharsetterUniverse.java
@@ -0,0 
+1,48 @@
+package io.xpipe.charsetter;
+
+import lombok.AllArgsConstructor;
+import lombok.Value;
+
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * The ordered list of candidate charsets that are tried when inferring the charset of some content.
+ */
+@Value
+@AllArgsConstructor
+public class CharsetterUniverse {
+
+    /** Candidate charsets in the order in which they should be tried. */
+    List<Charset> charsets;
+
+    /**
+     * Builds the candidate list from the given context: UTF-8 first, then the
+     * system charset, then every observed charset in the order it is listed.
+     * A charset that occurs more than once is only kept at its first position.
+     *
+     * @param ctx supplies the system charset name and the observed charset names
+     * @return a new universe holding the de-duplicated candidates
+     * @throws java.nio.charset.IllegalCharsetNameException if a charset name is syntactically illegal
+     * @throws java.nio.charset.UnsupportedCharsetException if a named charset is not available in this JVM
+     */
+    public static CharsetterUniverse create(CharsetterContext ctx) {
+        // A LinkedHashSet drops duplicates (e.g. a UTF-8 system charset) but keeps the priority order.
+        Set<Charset> cs = new LinkedHashSet<>();
+
+        cs.add(StandardCharsets.UTF_8);
+        cs.add(Charset.forName(ctx.getSystemCharsetName()));
+
+        // TODO: Locales
+
+        for (String name : ctx.getObservedCharsets()) {
+            cs.add(Charset.forName(name));
+        }
+
+        return new CharsetterUniverse(new ArrayList<>(cs));
+    }
+}
diff --git a/charsetter/src/main/java/module-info.java b/charsetter/src/main/java/module-info.java
new file mode 100644
index 00000000..38c83c3c
--- /dev/null
+++ b/charsetter/src/main/java/module-info.java
@@ -0,0 +1,7 @@
+module io.xpipe.charsetter {
+    exports io.xpipe.charsetter;
+
+    requires org.apache.commons.io;
+    requires org.apache.commons.lang3;
+    requires static lombok;
+}
\ No newline at end of file