Skip to content

albert classification error(Failed copying input tensor from GPU in order to run Identity: GPU sync failed [Op:Identity]) #83

@DrinkingMilktea

Description

@DrinkingMilktea

#tokenization define
from bert.tokenization.albert_tokenization import FullTokenizer
def createTokenizer():
return FullTokenizer("../albert_base/assets/30k-clean.vocab", spm_model_file="../albert_base/assets/30k-clean.model", do_lower_case=True)

def get_masks(tokens, max_seq_length):
"""Mask for padding"""
if len(tokens)>max_seq_length:
#Cutting down the excess length
tokens = tokens[0:max_seq_length]
return [1]*len(tokens)
else :
return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

def get_segments(tokens, max_seq_length):
if len(tokens)>max_seq_length:
#Cutting down the excess length
tokens = tokens[:max_seq_length]
segments = []
current_segment_id = 0
for token in tokens:
segments.append(current_segment_id)
if token == "[SEP]":
current_segment_id = 1
return segments
else:
segments = []
current_segment_id = 0
for token in tokens:
segments.append(current_segment_id)
if token == "[SEP]":
current_segment_id = 1
return segments + [0] * (max_seq_length - len(tokens))

def get_ids(tokens, tokenizer, max_seq_length):
if len(tokens)>max_seq_length:
tokens = tokens[:max_seq_length]
token_ids = tokenizer.convert_tokens_to_ids(tokens)
return token_ids
else:
token_ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
return input_ids

tokenizer = createTokenizer()
max_seq_length = 64 #This number will determine the number of tokens

def prep(s, get = 'id'):
stokens = tokenizer.tokenize(s)
stokens = ["[CLS]"] + stokens + ["[SEP]"]
if get == 'id':
input_ids = get_ids(stokens, tokenizer, max_seq_length)
return input_ids
elif get == 'mask':
input_masks = get_masks(stokens, max_seq_length)
return input_masks
else:
input_segments = get_segments(stokens, max_seq_length)
return input_segments

#train and test data load
import pandas as pd
train_set = pd.read_csv("../goemotion/train_set.csv")
test_set = pd.read_csv("../goemotion/test_set.csv")
train_X = [prep(sentence) for sentence in train_set["text"]]
train_Y = list(map(int, train_set["emotion"].tolist()))
test_X = [prep(sentence) for sentence in test_set["text"]]
test_Y = list(map(int, test_set["emotion"].tolist()))
print("data preprocess finished")

#albert model calling
import os
import bert
import tensorflow as tf

#GPU config
tf.config.experimental.set_memory_growth(tf.config.experimental.list_physical_devices("GPU")[0], True)

#parameters
model_name = "albert_base_v2"
model_ckpt = os.path.join("../albert_base", "model.ckpt-best")
model_params = bert.albert_params("../albert_base/")

#call and define model layers
albert_layer = bert.BertModelLayer.from_params(model_params, name="albert")
model_layer = tf.keras.Sequential([
tf.keras.layers.Input(shape=(max_seq_length,), dtype="int32", name="input_ids"),
albert_layer,
tf.keras.layers.Dense(112, activation=tf.nn.relu),
tf.keras.layers.Dense(27, activation=tf.nn.softmax),#0~27
tf.keras.layers.Dense(1, activation=tf.nn.softmax)
])
model_layer.build(input_shape=(None, max_seq_length))
bert.load_albert_weights(albert_layer, model_ckpt)

#compile
model_layer.compile(loss="sparse_categorical_crossentropy", optimizer=tf.optimizers.Adam(lr=0.00001), metrics=["sparse_categorical_accuracy"])
print(model_layer.summary())

#train start
checkpointName = os.path.join("../albert_base/models/", "albert_faq.ckpt")
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpointName,
save_weights_only=True,
verbose=1)

#train_start
history = model_layer.fit(
test_X,
test_Y,
epochs=300,
validation_data=(train_X, train_Y),
verbose=1,
callbacks=[cp_callback],
batch_size=2)

above is my code and
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.04 Driver Version: 455.23.04 CUDA Version: 11.1 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|===============================+======================+======================|
| 0 GeForce RTX 3090 On | 00000000:09:00.0 On | N/A |
| 33% 53C P2 111W / 350W | 1016MiB / 24265MiB | 1% Default |
| | | N/A |
+-------------------------------+----------------------+----------------------+
this is nvidia-smi

I use tensorflow-gpu 2.2 and cuda toolkit 10.1 and cudnn 7.6
My computer is 3900X 128GB(RAM) RTX3090 500GB(SSD)

and if run above code error message is below.

File "/home/sentiment/anaconda3/envs/mybert/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 6606, in raise_from_not_ok_status
six.raise_from(core._status_to_exception(e.code, message), None)
File "", line 3, in raise_from
tensorflow.python.framework.errors_impl.InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:GPU:0 to /job:localhost/replica:0/task:0/device:CPU:0 in order to run Identity: GPU sync failed [Op:Identity]

I want to train albert finetuning.
if i use tensorflow for cpu. it work fine but 1 epoch per 6 hour for training.
so I hope to use gpu

I really hard to find out solution for fixing but failed.

is there anyone know how to fix this error?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions