diff --git a/mediapipe/tasks/cc/components/processors/proto/BUILD b/mediapipe/tasks/cc/components/processors/proto/BUILD
index 82d4ea21b..55cf3fca1 100644
--- a/mediapipe/tasks/cc/components/processors/proto/BUILD
+++ b/mediapipe/tasks/cc/components/processors/proto/BUILD
@@ -93,3 +93,14 @@ mediapipe_proto_library(
         "//mediapipe/framework:calculator_proto",
     ],
 )
+
+mediapipe_proto_library(
+    name = "transformer_params_proto",
+    srcs = ["transformer_params.proto"],
+)
+
+mediapipe_proto_library(
+    name = "llm_params_proto",
+    srcs = ["llm_params.proto"],
+    deps = [":transformer_params_proto"],
+)
diff --git a/mediapipe/tasks/cc/components/processors/proto/llm_params.proto b/mediapipe/tasks/cc/components/processors/proto/llm_params.proto
new file mode 100644
index 000000000..b0c253598
--- /dev/null
+++ b/mediapipe/tasks/cc/components/processors/proto/llm_params.proto
@@ -0,0 +1,42 @@
+/* Copyright 2023 The MediaPipe Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package mediapipe.tasks.components.processors.proto;
+
+import "mediapipe/tasks/cc/components/processors/proto/transformer_params.proto";
+
+option java_package = "com.google.mediapipe.tasks.components.processors.proto";
+option java_outer_classname = "LLMParametersProto";
+
+// Parameters for Large Language Models (LLM).
+message LLMParameters {
+  // Transformer parameters (see TransformerParameters).
+  TransformerParameters transformer_parameters = 1;
+
+  // Size of vocabulary.
+  int32 vocab_size = 2;
+
+  // Whether or not to disable the KV cache, which is also referred to as
+  // state elsewhere.
+  bool disable_kv_cache = 3;
+
+  // Id of the start token.
+  int32 start_token_id = 4;
+
+  // Token to determine the end of output stream.
+  string stop_token = 5;
+}
diff --git a/mediapipe/tasks/cc/components/processors/proto/transformer_params.proto b/mediapipe/tasks/cc/components/processors/proto/transformer_params.proto
new file mode 100644
index 000000000..a04aa9571
--- /dev/null
+++ b/mediapipe/tasks/cc/components/processors/proto/transformer_params.proto
@@ -0,0 +1,68 @@
+/* Copyright 2023 The MediaPipe Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+syntax = "proto3";
+
+package mediapipe.tasks.components.processors.proto;
+
+option java_package = "com.google.mediapipe.tasks.components.processors.proto";
+option java_outer_classname = "TransformerParametersProto";
+
+// The parameters of transformer (https://arxiv.org/pdf/1706.03762.pdf)
+message TransformerParameters {
+  // Batch size of tensors.
+  int32 batch_size = 1;
+
+  // Maximum sequence length of the input/output tensor.
+  int32 max_seq_length = 2;
+
+  // Embedding dimension (or model dimension), `d_model` in the paper.
+  // `d_k` == `d_v` == `d_model`/`h`.
+  int32 embedding_dim = 3;
+
+  // Hidden dimension used in the feedforward layer, `d_ff` in the paper.
+  int32 hidden_dimension = 4;
+
+  // Head dimension, `d_k` or `d_v` in the paper.
+  int32 head_dimension = 5;
+
+  // Number of heads, `h` in the paper.
+  int32 num_heads = 6;
+
+  // Number of stacked transformers, `N` in the paper.
+  int32 num_stacks = 7;
+
+  // Deprecated: bool use_mqa. Use num_kv_heads below. Both the field number
+  // and the field name are reserved so that neither can be reused.
+  reserved 8;
+  reserved "use_mqa";
+
+  // Number of kv heads. 0 means Multi-Head-Attention (MHA), key and value have
+  // same number of heads as query; 1 means Multi-Query-Attention (MQA), key and
+  // value have one head; otherwise, this specifies the number of heads for key
+  // and value, and Grouped-Query-Attention (GQA) will be used. See
+  // https://arxiv.org/pdf/2305.13245.pdf for details.
+  int32 num_kv_heads = 9;
+
+  // Supported attention mask types.
+  enum AttentionMaskType {
+    UNSPECIFIED = 0;
+    CAUSAL = 1;
+    PREFIX = 2;
+  }
+
+  // The attention mask type.
+  AttentionMaskType attention_mask_type = 10;
+}