我正在尝试使用BERT嵌入进行文本分类。我正在使用此代码创建用于二进制分类的BERT嵌入层和密集层。

# Initialize session
sess = tf.Session()

class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.
  When running eval/predict on the TPU, we need to pad the number of examples
  to be a multiple of the batch size, because the TPU requires a fixed batch
  size. The alternative is to drop the last batch, which is bad because it means
  the entire output data won't be generated.
  We use this class instead of `None` because treating `None` as padding
  battches could cause silent errors.
  """


class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Constructs a InputExample.
    Args:
      guid: Unique id for the example.
      text_a: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The untokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
      label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label


def create_tokenizer_from_hub_module(bert_path):
    """Get the vocab file and casing info from the Hub module."""
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [tokenization_info["vocab_file"], tokenization_info["do_lower_case"]]
    )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)


def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label = 0
        return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label


def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""

    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in tqdm(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels).reshape(-1, 1),
    )


def convert_text_to_examples(texts, labels):
    """Create InputExamples"""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(guid=None, text_a=" ".join(text), text_b=None, label=label)
        )
    return InputExamples


class BertLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="mean",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Remove unused layers
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if not "/cls/" in var.name and not "/pooler/" in var.name
            ]
            trainable_layers = []
        else:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
            )

        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")

        return pooled

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)


# Build model
def build_model(max_seq_length):
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_output = BertLayer(n_fine_tune_layers=3)(bert_inputs)
    dense = tf.keras.layers.Dense(256, activation="relu")(bert_output)
    pred = tf.keras.layers.Dense(1, activation="sigmoid")(dense)

    # embedding_size = 768
    # bert_output = BertLayer(n_fine_tune_layers=3)(bert_inputs)
    # # Reshape bert_output before passing it the GRU
    # bert_output_ = tf.keras.layers.Reshape((max_seq_length, embedding_size))(bert_output)

    # gru_out = tf.keras.layers.GRU(100, activation='sigmoid')(bert_output_)
    # dense = tf.keras.layers.Dense(256, activation="relu")(gru_out)
    # pred = tf.keras.layers.Dense(1, activation="sigmoid")(dense)

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.summary()

    return model


def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)


def main():
    # Params for bert model and tokenization
    bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
    max_seq_length = 256

    train_df, test_df = master_df[:round(len(master_df)*.8)], master_df[round(len(master_df)*.8):]


    # Create datasets (Only take up to max_seq_length words for memory)
    train_text = train_df["words"].tolist()
    train_text = [" ".join(t.split()[0:max_seq_length]) for t in train_text]
    train_text = np.array(train_text, dtype=object)[:, np.newaxis]
    train_label = train_df["new_grouping"].tolist()

    test_text = test_df["words"].tolist()
    test_text = [" ".join(t.split()[0:max_seq_length]) for t in test_text]
    test_text = np.array(test_text, dtype=object)[:, np.newaxis]
    test_label = test_df["new_grouping"].tolist()

    # Instantiate tokenizer
    tokenizer = create_tokenizer_from_hub_module(bert_path)

    # Convert data to InputExample format
    train_examples = convert_text_to_examples(train_text, train_label)
    test_examples = convert_text_to_examples(test_text, test_label)

    # Convert to features
    (
        train_input_ids,
        train_input_masks,
        train_segment_ids,
        train_labels,
    ) = convert_examples_to_features(
        tokenizer, train_examples, max_seq_length=max_seq_length
    )
    (
        test_input_ids,
        test_input_masks,
        test_segment_ids,
        test_labels,
    ) = convert_examples_to_features(
        tokenizer, test_examples, max_seq_length=max_seq_length
    )

    model = build_model(max_seq_length)

    # Instantiate variables
    initialize_vars(sess)

    checkpoint_path = "bert_dir/cp.ckpt"
    checkpoint_dir = os.path.dirname('checkpoint_path')

    # Create a callback that saves the model's weights
    cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                    save_weights_only=True,
                                                    verbose=1)
    history = model.fit(
        [train_input_ids, train_input_masks, train_segment_ids],
        train_labels,
        validation_data=(
            [test_input_ids, test_input_masks, test_segment_ids],
            test_labels,
        ),
        epochs=1,
        batch_size=32,
        callbacks=[cp_callback]
    )

    model.save('bert_1.h5')


    return history

if __name__ == "__main__":
   history = main()


代码来自这里:https://github.com/strongio/keras-bert/blob/master/keras-bert.py

我要做的是更改模型架构。具体来说,我想尝试添加LSTM或BiLSTM层并退出。模型代码在这里:

def build_model(max_seq_length):
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]

    bert_output = BertLayer(n_fine_tune_layers=3)(bert_inputs)
    dense = tf.keras.layers.Dense(256, activation="relu")(bert_output)
    pred = tf.keras.layers.Dense(1, activation="sigmoid")(dense)

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    model.summary()

    return model


这篇文章提出了类似的问题:

add LSTM/GRU to BERT embeddings in keras tensorflow

但是,帖子中的解决方案对我不起作用。该帖子建议这样做:

embedding_size = 768
in_id = Input(shape=(max_seq_length,), name="input_ids")
in_mask = Input(shape=(max_seq_length,), name="input_masks")
in_segment = Input(shape=(max_seq_length,), name="segment_ids")

bert_inputs = [in_id, in_mask, in_segment]
bert_output = BertLayer(n_fine_tune_layers=12, pooling="mean")(bert_inputs)
bert_output = Reshape((max_seq_length, embedding_size))(bert_output)

bilstm = Bidirectional(LSTM(128, dropout=0.2,recurrent_dropout=0.2,return_sequences=True))(bert_output)
output = Dense(output_size, activation="softmax")(bilstm)


但是我得到了错误:


  ValueError:传递了形状为(9300,1)的目标数组
  输出形状(无,256、1),同时用作损失binary_crossentropy


编辑1

当我尝试使用下面的Il.SQ建议的代码时,出现此错误:



---------------------------------------------------------------------------
ResourceExhaustedError                    Traceback (most recent call last)
<ipython-input-19-d270f0ba6b07> in <module>()
    372
    373 if __name__ == "__main__":
--> 374    history, train_df, val_df = main()

5 frames
<ipython-input-19-d270f0ba6b07> in main()
    363         epochs=1,
    364         batch_size=32,
--> 365         callbacks=[cp_callback]
    366     )
    367

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, max_queue_size, workers, use_multiprocessing, **kwargs)
    725         max_queue_size=max_queue_size,
    726         workers=workers,
--> 727         use_multiprocessing=use_multiprocessing)
    728
    729   def evaluate(self,

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_arrays.py in fit(self, model, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_freq, **kwargs)
    673         validation_steps=validation_steps,
    674         validation_freq=validation_freq,
--> 675         steps_name='steps_per_epoch')
    676
    677   def evaluate(self,

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/engine/training_arrays.py in model_iteration(model, inputs, targets, sample_weights, batch_size, epochs, verbose, callbacks, val_inputs, val_targets, val_sample_weights, shuffle, initial_epoch, steps_per_epoch, validation_steps, validation_freq, mode, validation_in_fit, prepared_feed_values_from_dataset, steps_name, **kwargs)
    392
    393         # Get outputs.
--> 394         batch_outs = f(ins_batch)
    395         if not isinstance(batch_outs, list):
    396           batch_outs = [batch_outs]

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/keras/backend.py in __call__(self, inputs)
   3474
   3475     fetched = self._callable_fn(*array_vals,
-> 3476                                 run_metadata=self.run_metadata)
   3477     self._call_fetch_callbacks(fetched[-len(self._fetches):])
   3478     output_structure = nest.pack_sequence_as(

/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in __call__(self, *args, **kwargs)
   1470         ret = tf_session.TF_SessionRunCallable(self._session._session,
   1471                                                self._handle, args,
-> 1472                                                run_metadata_ptr)
   1473         if run_metadata:
   1474           proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted: OOM when allocating tensor with shape[32,12,256,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node bert_layer_6/bert_layer_6_module_apply_tokens/bert/encoder/layer_9/attention/self/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[loss_2/mul/_8343]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

  (1) Resource exhausted: OOM when allocating tensor with shape[32,12,256,256] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node bert_layer_6/bert_layer_6_module_apply_tokens/bert/encoder/layer_9/attention/self/Softmax}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

0 successful operations.
0 derived errors ignored.





编辑2

为了尝试解决来自Edit 1的上述错误,我将batchsize减小为2。但是,现在有一个新错误:



---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
<ipython-input-9-3c92390d2c23> in <module>()
    372
    373 if __name__ == "__main__":
--> 374    history, train_df, val_df = main()

5 frames
/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/client/session.py in __call__(self, *args, **kwargs)
   1470         ret = tf_session.TF_SessionRunCallable(self._session._session,
   1471                                                self._handle, args,
-> 1472                                                run_metadata_ptr)
   1473         if run_metadata:
   1474           proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument: Input to reshape is a tensor with 1536 values, but the requested shape has 393216
	 [[{{node reshape_2/Reshape}}]]
	 [[loss/mul/_3739]]
  (1) Invalid argument: Input to reshape is a tensor with 1536 values, but the requested shape has 393216
	 [[{{node reshape_2/Reshape}}]]
0 successful operations.
0 derived errors ignored.





编辑3

这是工作密集模型的模型摘要:



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to
==================================================================================================
input_ids (InputLayer)          [(None, 256)]        0
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 256)]        0
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0
__________________________________________________________________________________________________
bert_layer_2 (BertLayer)        (None, 768)          110104890   input_ids[0][0]
                                                                 input_masks[0][0]
                                                                 segment_ids[0][0]
__________________________________________________________________________________________________
dense_4 (Dense)                 (None, 256)          196864      bert_layer_2[0][0]
__________________________________________________________________________________________________
dense_5 (Dense)                 (None, 1)            257         dense_4[0][0]
==================================================================================================
Total params: 110,302,011
Trainable params: 21,460,737
Non-trainable params: 88,841,274
__________________________________________________________________________________________________
Train on 5632 samples, validate on 1408 samples
5628/5632 [============================>.] - ETA: 0s - loss: 0.4927 - acc: 0.8220





这是不起作用的建议LSTM模型的模型摘要:



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to
==================================================================================================
input_ids (InputLayer)          [(None, 256)]        0
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 256)]        0
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0
__________________________________________________________________________________________________
bert_layer_2 (BertLayer)        (None, 768)          110104890   input_ids[0][0]
                                                                 input_masks[0][0]
                                                                 segment_ids[0][0]
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 256, 768)     0           bert_layer_2[0][0]
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 256, 256)     918528      reshape_2[0][0]
__________________________________________________________________________________________________
global_max_pooling1d_1 (GlobalM (None, 256)          0           bidirectional_1[0][0]
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 1)            257         global_max_pooling1d_1[0][0]
==================================================================================================
Total params: 111,023,675
Trainable params: 85,973,249
Non-trainable params: 25,050,426
__________________________________________________________________________________________________

最佳答案

首先,减小批量大小。

然后更改为:
这将添加一个全局最大池化1d层进行展平。

embedding_size = 768
in_id = Input(shape=(max_seq_length,), name="input_ids")
in_mask = Input(shape=(max_seq_length,), name="input_masks")
in_segment = Input(shape=(max_seq_length,), name="segment_ids")

bert_inputs = [in_id, in_mask, in_segment]
bert_output = BertLayer(n_fine_tune_layers=12, pooling="mean")(bert_inputs)
bert_output = Reshape((embedding_size,1))(bert_output)

bilstm = Bidirectional(LSTM(128, dropout=0.2,recurrent_dropout=0.2,return_sequences=True))(bert_output)
pool=GlobalMaxPooling1D()(bilstm)
output = Dense(output_size, activation="softmax")(pool)


如果不起作用,则您的火车输入可能无效。

10-08 07:56