AvroSerializer.java

// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

package com.azure.data.schemaregistry.apacheavro;

import com.azure.core.util.logging.ClientLogger;
import com.azure.core.util.serializer.TypeReference;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericContainer;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.io.DecoderFactory;
import org.apache.avro.io.EncoderFactory;
import org.apache.avro.message.BinaryMessageDecoder;
import org.apache.avro.specific.SpecificData;
import org.apache.avro.specific.SpecificDatumReader;
import org.apache.avro.specific.SpecificDatumWriter;
import org.apache.avro.specific.SpecificRecord;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;

/**
 * Class containing implementation of Apache Avro serializer
 */
class AvroSerializer {
    private static final Map<Class<?>, Schema> PRIMITIVE_SCHEMAS;
    private static final Schema NULL_SCHEMA = Schema.create(Schema.Type.NULL);
    private static final int V1_HEADER_LENGTH = 10;
    private static final byte[] V1_HEADER = new byte[]{-61, 1};

    private final ClientLogger logger = new ClientLogger(AvroSerializer.class);
    private final boolean avroSpecificReader;
    private final Schema.Parser parser;
    private final EncoderFactory encoderFactory;
    private final DecoderFactory decoderFactory;

    static {
        final HashMap<Class<?>, Schema> schemas = new HashMap<>();

        final Schema booleanSchema = Schema.create(Schema.Type.BOOLEAN);
        schemas.put(Boolean.class, booleanSchema);
        schemas.put(boolean.class, booleanSchema);

        final Schema intSchema = Schema.create(Schema.Type.INT);
        schemas.put(Integer.class, intSchema);
        schemas.put(int.class, intSchema);

        final Schema longSchema = Schema.create(Schema.Type.LONG);
        schemas.put(Long.class, longSchema);
        schemas.put(long.class, longSchema);

        final Schema floatSchema = Schema.create(Schema.Type.FLOAT);
        schemas.put(Float.class, floatSchema);
        schemas.put(float.class, floatSchema);

        final Schema doubleSchema = Schema.create(Schema.Type.DOUBLE);
        schemas.put(Double.class, doubleSchema);
        schemas.put(double.class, doubleSchema);

        final Schema byteSchema = Schema.create(Schema.Type.BYTES);
        schemas.put(byte.class, byteSchema);
        schemas.put(Byte.class, byteSchema);
        schemas.put(byte[].class, byteSchema);
        schemas.put(Byte[].class, byteSchema);

        // This class is abstract but not final.
        schemas.put(ByteBuffer.class, byteSchema);

        final Schema stringSchema = Schema.create(Schema.Type.STRING);
        schemas.put(String.class, stringSchema);

        PRIMITIVE_SCHEMAS = Collections.unmodifiableMap(schemas);
    }

    /**
     * Instantiates AvroCodec instance
     *
     * @param avroSpecificReader flag indicating if decoder should decode records as {@link SpecificRecord
     *     SpecificRecords}.
     * @param parser Schema parser to use.
     * @param encoderFactory Encoder factory
     * @param decoderFactory Decoder factory
     */
    AvroSerializer(boolean avroSpecificReader, Schema.Parser parser, EncoderFactory encoderFactory,
        DecoderFactory decoderFactory) {

        this.avroSpecificReader = avroSpecificReader;
        this.parser = Objects.requireNonNull(parser, "'parser' cannot be null.");
        this.encoderFactory = Objects.requireNonNull(encoderFactory, "'encoderFactory' cannot be null.");
        this.decoderFactory = Objects.requireNonNull(decoderFactory, "'decoderFactory' cannot be null.");
    }

    /**
     * @param schemaString string representation of schema
     *
     * @return avro schema
     */
    Schema parseSchemaString(String schemaString) {
        return this.parser.parse(schemaString);
    }

    /**
     * Returns A byte[] containing Avro encoding of object parameter.
     *
     * @param object Object to be encoded into byte stream
     *
     * @return A set of bytes that represent the object.
     *
     * @throws IllegalArgumentException If the object is not a serializable type.
     * @throws IllegalStateException if the object could not be serialized to an object stream or there was a
     *     runtime exception during serialization.
     */
    <T> byte[] encode(T object) {
        final Schema schema = getSchema(object);

        try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) {
            if (object instanceof byte[]) {
                // todo: real avro byte arrays require writing array size to buffer
                outputStream.write((byte[]) object);
            } else {
                BinaryEncoder encoder = encoderFactory.directBinaryEncoder(outputStream, null);
                DatumWriter<T> writer;
                if (object instanceof SpecificRecord) {
                    writer = new SpecificDatumWriter<>(schema);
                } else {
                    writer = new GenericDatumWriter<>(schema);
                }
                writer.write(object, encoder);
                encoder.flush();
            }
            return outputStream.toByteArray();
        } catch (IOException | RuntimeException e) {
            // Avro serialization can throw AvroRuntimeException, NullPointerException, ClassCastException, etc
            throw logger.logExceptionAsError(new IllegalStateException("Error serializing Avro message", e));
        }
    }

    /**
     * @param bytes byte array containing encoded bytes
     * @param schemaBytes schema content for Avro reader to read - fetched from Azure Schema Registry
     *
     * @return deserialized object
     */
    <T> T decode(byte[] bytes, byte[] schemaBytes, TypeReference<T> typeReference) {
        Objects.requireNonNull(bytes, "'bytes' must not be null.");
        Objects.requireNonNull(schemaBytes, "'schemaBytes' must not be null.");

        final String schemaString = new String(schemaBytes, StandardCharsets.UTF_8);
        final Schema schemaObject = parseSchemaString(schemaString);

        if (isSingleObjectEncoded(bytes)) {
            final BinaryMessageDecoder<T> messageDecoder = new BinaryMessageDecoder<>(SpecificData.get(), schemaObject);

            try {
                return messageDecoder.decode(bytes);
            } catch (IOException e) {
                throw logger.logExceptionAsError(new UncheckedIOException(
                    "Unable to deserialize Avro schema object using binary message decoder.", e));
            }
        } else {
            final DatumReader<T> reader = getDatumReader(schemaObject, typeReference);

            try {
                return reader.read(null, decoderFactory.binaryDecoder(bytes, null));
            } catch (IOException | RuntimeException e) {
                throw logger.logExceptionAsError(new IllegalStateException("Error deserializing raw Avro message.", e));
            }
        }
    }

    /**
     * Returns Avro schema for specified object, including null values
     *
     * @param object object for which Avro schema is being returned
     *
     * @return Avro schema for object's data structure
     *
     * @throws IllegalArgumentException if object type is unsupported.
     */
    static Schema getSchema(Object object) {
        if (object instanceof GenericContainer) {
            return ((GenericContainer) object).getSchema();
        }

        if (object == null) {
            return NULL_SCHEMA;
        }

        final Class<?> objectClass = object.getClass();
        final Schema primitiveSchema = getPrimitiveSchema(objectClass);
        if (primitiveSchema != null) {
            return primitiveSchema;
        } else {
            throw new IllegalArgumentException("Unsupported Avro type. Supported types are null, GenericContainer,"
                + " Boolean, Integer, Long, Float, Double, String, Byte[], Byte, ByteBuffer, and their primitive"
                + " equivalents. Actual: " + objectClass);
        }
    }

    /**
     * True if the object has the single object payload header. The header is comprised of:
     * <ul>
     *     <li>2 byte marker, C3 01</li>
     *     <li>8 byte little-endian CRC-64-AVRO fingerprint of the object's schema</li>
     * </ul>
     *
     * @param schemaBytes Bytes to read from.
     *
     * @return true if the object has the single object payload header; false otherwise.
     *
     * @see <a href="https://avro.apache.org/docs/current/spec.html#single_object_encoding">Single Object Encoding</a>
     */
    static boolean isSingleObjectEncoded(byte[] schemaBytes) {
        if (schemaBytes.length < V1_HEADER_LENGTH) {
            return false;
        }

        return V1_HEADER[0] == schemaBytes[0] && V1_HEADER[1] == schemaBytes[1];
    }

    /**
     * Gets the type's schema if there is one.
     *
     * @param clazz Class to get schema.
     * @param <T> The type of object.
     *
     * @return The {@link Schema} or {@code null} if it was not a GenericContainer, could not instantiate the type, or
     *     there was no default constructor.
     */
    <T> Schema getSchemaFromTypeReference(Class<T> clazz) {
        if (!GenericContainer.class.isAssignableFrom(clazz)) {
            return null;
        }

        final Optional<Constructor<?>> defaultConstructor;
        try {
            defaultConstructor = Arrays.stream(clazz.getDeclaredConstructors())
                .filter(constructor -> constructor.getParameterCount() == 0)
                .findFirst();
        } catch (SecurityException e) {
            logger.info("Could not get declaring constructors for deserializing T ({}). Using writer schema.",
                clazz, e);
            return null;
        }

        if (!defaultConstructor.isPresent()) {
            return null;
        }

        Object instance = null;
        try {
            instance = defaultConstructor.get().newInstance();
        } catch (InstantiationException | IllegalAccessException | InvocationTargetException e) {
            logger.info("Could not create new instance for deserializing T ({}). Using writer schema.", clazz, e);
        }

        return instance instanceof GenericContainer
            ? ((GenericContainer) instance).getSchema()
            : null;
    }

    /**
     * Gets a schema for the given class if it is an Avro primitive type.
     *
     * @param clazz Object class
     *
     * @return Matching primitive schema, otherwise {@code null} if it is not.
     */
    private static Schema getPrimitiveSchema(Class<?> clazz) {
        final Schema schema = PRIMITIVE_SCHEMAS.get(clazz);
        if (schema != null) {
            return schema;
        } else if (CharSequence.class.isAssignableFrom(clazz)) {
            return PRIMITIVE_SCHEMAS.get(String.class);
        } else if (ByteBuffer.class.isAssignableFrom(clazz)) {
            return PRIMITIVE_SCHEMAS.get(Byte[].class);
        } else {
            return null;
        }
    }

    /**
     * Returns correct reader for decoding payload.
     *
     * @param writerSchema Avro schema fetched from schema registry store
     *
     * @return correct Avro DatumReader object given encoder configuration
     */
    private <T> DatumReader<T> getDatumReader(Schema writerSchema, TypeReference<T> typeReference) {
        final Class<T> clazz = typeReference.getJavaClass();
        final Schema primitiveSchema = getPrimitiveSchema(clazz);

        if (primitiveSchema != null) {
            if (avroSpecificReader) {
                return new SpecificDatumReader<>(writerSchema);
            } else {
                return new GenericDatumReader<>(writerSchema);
            }
        }

        final Schema readerSchema = getSchemaFromTypeReference(clazz);
        if (readerSchema != null && !readerSchema.equals(writerSchema)) {
            logger.verbose("The writer schema is different than reader schema. Using reader schema. "
                + "Writer: '{}'. Reader: '{}'", writerSchema, readerSchema);

            return new SpecificDatumReader<>(writerSchema, readerSchema);
        }

        if (SpecificRecord.class.isAssignableFrom(clazz)) {
            return new SpecificDatumReader<>(writerSchema);
        } else {
            return new GenericDatumReader<>(writerSchema);
        }
    }
}