albert classification error(Failed copying input tensor from GPU in order to run Identity: GPU sync failed [Op:Identity])

# Tokenizer definition
from bert.tokenization.albert_tokenization import FullTokenizer
def createTokenizer():
    """Create the tokenizer used to split sentences into tokens.

    Returns:
        A ``FullTokenizer`` built from the vocabulary and SentencePiece
        model files under ``../albert_base/assets``, with lower-casing
        enabled.
    """
    vocab_path = "../albert_base/assets/30k-clean.vocab"
    sentencepiece_path = "../albert_base/assets/30k-clean.model"
    return FullTokenizer(
        vocab_path,
        spm_model_file=sentencepiece_path,
        do_lower_case=True,
    )

def get_masks(tokens, max_seq_length):
    """Build the padding mask for a token sequence.

    Args:
        tokens: Sequence of tokens; only its length is used.
        max_seq_length: Length of the returned mask.

    Returns:
        A list of ``max_seq_length`` ints: 1 for every kept token position
        (tokens beyond ``max_seq_length`` are cut off), followed by 0 for
        every padding position.
    """
    # Number of real (non-padding) positions after truncation.
    kept = min(len(tokens), max_seq_length)
    return [1] * kept + [0] * (max_seq_length - kept)

def get_segments(tokens, max_seq_length):
    """Build the segment-id list for a token sequence.

    Args:
        tokens: Sequence of string tokens, possibly containing ``"[SEP]"``.
        max_seq_length: Length of the returned list.

    Returns:
        A list of ``max_seq_length`` ints. Tokens up to and including the
        first ``"[SEP]"`` get segment 0, every later token gets segment 1,
        and padding positions get 0. Tokens beyond ``max_seq_length`` are
        cut off.
    """
    segments = []
    current_segment_id = 0
    for token in tokens[:max_seq_length]:
        segments.append(current_segment_id)
        # The separator still belongs to the segment it closes; only the
        # tokens after it move to segment 1.
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(segments))

def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids, truncated or padded to a fixed length.

    Args:
        tokens: Sequence of string tokens.
        tokenizer: Object exposing ``convert_tokens_to_ids(tokens)`` that
            returns a list of ids.
        max_seq_length: Length of the returned list.

    Returns:
        A list of ``max_seq_length`` ids. Tokens beyond ``max_seq_length``
        are cut off before conversion; shorter inputs are right-padded
        with 0.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens[:max_seq_length])
    # 0 is used as the padding id -- assumed to match the vocabulary's pad
    # entry; TODO confirm against the vocab file.
    return token_ids + [0] * (max_seq_length - len(token_ids))

# Shared tokenizer instance used by prep().
tokenizer = createTokenizer()
# Number of tokens per encoded example; also used as the model input length.
max_seq_length = 64

def prep(s, get='id'):
    """Encode one sentence as a fixed-length model input.

    The sentence is tokenized with the module-level ``tokenizer``, wrapped
    in ``[CLS]`` ... ``[SEP]`` and converted to a list of length
    ``max_seq_length``.

    Args:
        s: The sentence to encode.
        get: Which encoding to return: ``'id'`` for token ids, ``'mask'``
            for the padding mask, any other value for segment ids.

    Returns:
        A list of ``max_seq_length`` ints.
    """
    # Truncate BEFORE adding the special tokens, keeping two positions free
    # for [CLS] and [SEP]. Truncating afterwards would cut the closing
    # [SEP] off every over-long sentence.
    stokens = tokenizer.tokenize(s)[:max(max_seq_length - 2, 0)]
    stokens = ["[CLS]"] + stokens + ["[SEP]"]
    if get == 'id':
        return get_ids(stokens, tokenizer, max_seq_length)
    if get == 'mask':
        return get_masks(stokens, max_seq_length)
    return get_segments(stokens, max_seq_length)

# Load the train and test data
import pandas as pd
# Read both splits from their CSV files.
train_set = pd.read_csv("../goemotion/train_set.csv")
test_set = pd.read_csv("../goemotion/test_set.csv")

# Features: every sentence in the "text" column encoded as token ids
# (prep's default mode).
train_X = list(map(prep, train_set["text"]))
test_X = list(map(prep, test_set["text"]))

# Labels: the "emotion" column coerced to plain Python ints.
train_Y = [int(label) for label in train_set["emotion"].tolist()]
test_Y = [int(label) for label in test_set["emotion"].tolist()]
print("data preprocess finished")

# Load the ALBERT model
import os
import bert
import tensorflow as tf

# GPU config
# Enable on-demand GPU memory growth for every visible GPU. Iterating (rather
# than indexing [0]) also makes this a no-op on machines without a GPU, where
# the original raised IndexError.
# NOTE(review): the issue text below reports TF 2.2 / CUDA 10.1 on an
# RTX 3090. Ampere cards presumably need a CUDA 11 build of TensorFlow
# (TF >= 2.4) -- confirm against TensorFlow's tested build configurations;
# that mismatch is a likely cause of the "GPU sync failed" error.
for gpu in tf.config.experimental.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)

# parameters
model_name = "albert_base_v2"
model_ckpt = os.path.join("../albert_base", "model.ckpt-best")
model_params = bert.albert_params("../albert_base/")
# One output unit per class. Labels are integer class indices, so the class
# count is the largest label + 1 (the original comment mentions labels 0~27).
num_classes = max(train_Y + test_Y) + 1

# call and define model layers
albert_layer = bert.BertModelLayer.from_params(model_params, name="albert")
model_layer = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(max_seq_length,), dtype="int32", name="input_ids"),
    albert_layer,
    # Keep only the first ([CLS]) position as the sentence representation.
    # Assumes albert_layer returns (batch, seq_len, hidden) as in the
    # bert-for-tf2 README example -- TODO confirm.
    tf.keras.layers.Lambda(lambda seq_output: seq_output[:, 0, :]),
    tf.keras.layers.Dense(112, activation=tf.nn.relu),
    # Single softmax head over all classes. The original ended in
    # Dense(1, softmax), which always outputs 1.0 and cannot represent
    # more than one class for sparse_categorical_crossentropy.
    tf.keras.layers.Dense(num_classes, activation=tf.nn.softmax),
])
model_layer.build(input_shape=(None, max_seq_length))
# Load the pre-trained checkpoint weights into the ALBERT layer.
bert.load_albert_weights(albert_layer, model_ckpt)

# compile
# Integer class labels -> sparse categorical loss/metric.
model_layer.compile(
    loss="sparse_categorical_crossentropy",
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer=tf.optimizers.Adam(learning_rate=0.00001),
    metrics=["sparse_categorical_accuracy"],
)
# summary() prints the table itself and returns None, so it is not wrapped
# in print() (which would emit a stray "None").
model_layer.summary()

# train start
# Save the model weights (weights only) to this path during training.
checkpointName = os.path.join("../albert_base/models/", "albert_faq.ckpt")
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpointName,
                                                save_weights_only=True,
                                                verbose=1)

# Fit on the training split and validate on the test split (the original
# call had the two swapped, training on test data).
history = model_layer.fit(
            train_X,
            train_Y,
            epochs=300,
            validation_data=(test_X, test_Y),
            verbose=1,
            callbacks=[cp_callback],
            batch_size=2)

above is my code and
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.04    Driver Version: 455.23.04    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  GeForce RTX 3090    On   | 00000000:09:00.0  On |                  N/A |
| 33%   53C    P2   111W / 350W |   1016MiB / 24265MiB |      1%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
The above is the output of nvidia-smi.

I use tensorflow-gpu 2.2 with CUDA Toolkit 10.1 and cuDNN 7.6.
My computer has a 3900X CPU, 128 GB of RAM, an RTX 3090 GPU, and a 500 GB SSD.

When I run the code above, I get the error message below.


  File "/home/sentiment/anaconda3/envs/mybert/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 6606, in raise_from_not_ok_status
    six.raise_from(core._status_to_exception(e.code, message), None)
  File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

I want to fine-tune ALBERT.
If I use the CPU build of TensorFlow, it works fine, but training takes 6 hours per epoch,
so I would like to use the GPU.

I have tried hard to find a solution, but without success.

Does anyone know how to fix this error?

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

albert classification error(Failed copying input tensor from GPU in order to run Identity: GPU sync failed [Op:Identity]) #83

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

albert classification error(Failed copying input tensor from GPU in order to run Identity: GPU sync failed [Op:Identity]) #83

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions