package com.github.tjake.jlama.model.gpt2;

import com.github.tjake.jlama.safetensors.tokenizer.BPETokenizer;
import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import net.fellbaum.jemoji.EmojiManager;

/* loaded from: input_file:com/github/tjake/jlama/model/gpt2/GPT2Tokenizer.class */
/**
 * Byte-pair-encoding tokenizer that substitutes selected byte values with stand-in code points
 * before vocabulary lookup and reverses that substitution when decoding.
 *
 * <p>Two static tables drive the translation:
 * <ul>
 *   <li>{@code alteredBytes}: every value in {@code 0..255} that lies outside the ranges
 *       {@code 33..126}, {@code 161..172} and {@code 174..255} is assigned the next free code
 *       point starting at {@code 256}, in ascending byte order.</li>
 *   <li>{@code codePointsToByteStrings}: each code point in {@code 9000..128512} that jemoji
 *       reports as an emoji, mapped to a string holding one character per UTF-8 byte of that
 *       emoji.</li>
 * </ul>
 *
 * <p>NOTE(review): the byte table appears to mirror the {@code bytes_to_unicode} table of the
 * reference GPT-2 encoder - confirm against upstream before changing the ranges.
 */
public class GPT2Tokenizer extends BPETokenizer {
    /** Number of distinct unsigned byte values. */
    private static final int BYTE_VALUE_COUNT = 256;

    /** First stand-in code point handed out to a substituted byte value. */
    private static final int FIRST_SUBSTITUTE_CODE_POINT = 256;

    /** Inclusive lower bound (U+2328) of the code point range scanned for emoji. */
    private static final int FIRST_EMOJI_CANDIDATE = 9000;

    /** Inclusive upper bound (U+1F600) of the code point range scanned for emoji. */
    private static final int LAST_EMOJI_CANDIDATE = 128512;

    /** Byte value to stand-in code point; the inverse view is used when decoding. */
    private static final BiMap<Integer, Integer> alteredBytes = buildAlteredBytes();

    /** Emoji code point to the string made of its UTF-8 bytes, one character per byte. */
    private static final BiMap<Integer, String> codePointsToByteStrings = buildCodePointsToByteStrings();

    /**
     * Creates the tokenizer by delegating to the {@link BPETokenizer} constructor.
     *
     * @param path location passed through unchanged to the superclass
     */
    public GPT2Tokenizer(Path path) {
        super(path);
    }

    /**
     * Replaces every code point that has an entry in the byte table with its stand-in code
     * point; all other code points are kept unchanged.
     *
     * @param str text to transform
     * @return the transformed text
     */
    @Override
    protected String preProcess(String str) {
        return remapCodePoints(str, alteredBytes);
    }

    /**
     * Looks up the vocabulary id for a single byte after applying the byte table.
     *
     * @param b raw byte, interpreted as unsigned
     * @return the vocabulary id of the one-character string for the (possibly substituted)
     *     code point, or that code point itself when the vocabulary has no such entry
     */
    @Override
    protected long encodeCharacterAsToken(byte b) {
        int unsignedValue = Byte.toUnsignedInt(b);
        int codePoint = alteredBytes.getOrDefault(unsignedValue, unsignedValue);
        Long token = (Long) this.model.vocabLookup.get(Character.toString(codePoint));
        // NOTE(review): the fallback was reconstructed - the previous text referenced an
        // undeclared variable here. The substituted code point is the only boxed Integer in
        // scope that matches; confirm against the upstream source.
        return token == null ? codePoint : token.longValue();
    }

    /**
     * Always reports that a token id has no single-character decoding.
     *
     * @param j token id (unused)
     * @return an empty {@link Optional}
     */
    @Override
    protected Optional<Character> maybeDecodeTokenAsCharacter(long j) {
        return Optional.empty();
    }

    /**
     * Resolves a token id to its vocabulary string and maps stand-in code points back to the
     * byte values they replaced.
     *
     * @param j token id to decode
     * @return the vocabulary string with the byte substitution reversed
     * @throws NullPointerException if the vocabulary has no entry for {@code j}
     */
    @Override
    public String decode(long j) {
        String symbol = (String) this.model.vocabLookup.inverse().get(Long.valueOf(j));
        return remapCodePoints(symbol, alteredBytes.inverse());
    }

    /**
     * Replaces every occurrence of an emoji's byte string with the emoji itself.
     *
     * @param str decoded text that may contain emoji spelled out as one character per UTF-8 byte
     * @return the text with those sequences collapsed back into emoji code points
     */
    @Override
    protected String postProcess(String str) {
        String result = str;
        for (Map.Entry<Integer, String> entry : codePointsToByteStrings.entrySet()) {
            String byteString = entry.getValue();
            if (result.contains(byteString)) {
                result = result.replace(byteString, Character.toString(entry.getKey().intValue()));
            }
        }
        return result;
    }

    /** Maps each code point of {@code text} through {@code table}, keeping unmapped ones as-is. */
    private static String remapCodePoints(String text, Map<Integer, Integer> table) {
        return text.codePoints()
            .map(codePoint -> table.getOrDefault(codePoint, codePoint))
            .mapToObj(Character::toString)
            .collect(Collectors.joining());
    }

    /** Returns whether a byte value is left untouched by the byte table. */
    private static boolean isKeptAsIs(int value) {
        return (value >= 33 && value <= 126)
            || (value >= 161 && value <= 172)
            || (value >= 174 && value <= 255);
    }

    /** Builds the byte table, assigning stand-in code points in ascending byte order. */
    private static BiMap<Integer, Integer> buildAlteredBytes() {
        BiMap<Integer, Integer> table = HashBiMap.create();
        int nextSubstitute = FIRST_SUBSTITUTE_CODE_POINT;
        for (int value = 0; value < BYTE_VALUE_COUNT; value++) {
            if (!isKeptAsIs(value)) {
                table.put(value, nextSubstitute++);
            }
        }
        return table;
    }

    /** Builds the emoji table by scanning the candidate code point range in ascending order. */
    private static BiMap<Integer, String> buildCodePointsToByteStrings() {
        BiMap<Integer, String> table = HashBiMap.create();
        for (int codePoint = FIRST_EMOJI_CANDIDATE; codePoint <= LAST_EMOJI_CANDIDATE; codePoint++) {
            String symbol = Character.toString(codePoint);
            if (!EmojiManager.isEmoji(symbol)) {
                continue;
            }
            // Spell the emoji as one character per UTF-8 byte, each byte read as unsigned.
            StringBuilder byteString = new StringBuilder();
            for (byte b : symbol.getBytes(StandardCharsets.UTF_8)) {
                byteString.append(Character.toString(Byte.toUnsignedInt(b)));
            }
            table.put(codePoint, byteString.toString());
        }
        return table;
    }
}
