138 changes: 138 additions & 0 deletions WEEK02/practice/eng_word2vec.py
@@ -0,0 +1,138 @@
# 1. Building English word embeddings
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input
from tensorflow.keras.layers import Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG
import gensim

# Check where downloaded datasets are cached
from sklearn.datasets import get_data_home
print(get_data_home())


print("데이터 다운로드 중...")
dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
dataset = dataset.data

news_df = pd.DataFrame({'document':dataset})
news_df

# 1-1. Data preprocessing
# Check for missing values
print("Preprocessing data...")
news_df.replace("", float("NaN"), inplace=True)
news_df = news_df.dropna().reset_index(drop=True)
print(f"필터링된 데이터셋 총 개수 : {len(news_df)}")
# >> 11096

# Remove duplicates
print("Removing duplicates...")
processed_news_df = news_df.drop_duplicates(['document']).reset_index(drop=True)
processed_news_df

# Keep alphabetic characters only — plain str.replace() does not interpret regex,
# so use the pandas .str accessor with regex=True
processed_news_df['document'] = processed_news_df['document'].str.replace("[^a-zA-Z]", " ", regex=True)
# Drop tokens shorter than 3 characters
processed_news_df['document'] = processed_news_df['document'].apply(lambda x: ' '.join([token for token in x.split() if len(token) > 2]))
# Keep documents of at most 200 characters with more than 5 tokens
processed_news_df = processed_news_df[processed_news_df.document.apply(lambda x: len(str(x)) <= 200 and len(str(x).split()) > 5)].reset_index(drop=True)
# Lowercase everything
processed_news_df['document'] = processed_news_df['document'].apply(lambda x: x.lower())
processed_news_df


nltk.download('stopwords')

stop_words = stopwords.words('english')

tokenized_doc = processed_news_df['document'].apply(lambda x: x.split())
tokenized_doc = tokenized_doc.apply(lambda x: [s_word for s_word in x if s_word not in stop_words])
tokenized_doc

# 1-2. Word tokenization
print("Tokenizing words...")
# Drop documents left with one token or fewer after stopword removal
# (np.delete on a Series of ragged lists is fragile; Series.drop is the equivalent here)
drop_train = [index for index, sentence in enumerate(tokenized_doc) if len(sentence) <= 1]
tokenized_doc = tokenized_doc.drop(drop_train).reset_index(drop=True)

print(len(tokenized_doc))
# >> 2235

tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)

word2idx = tokenizer.word_index
idx2word = {value : key for key, value in word2idx.items()}
encoded = tokenizer.texts_to_sequences(tokenized_doc)

vocab_size = len(word2idx) + 1
print("어휘 사전 크기:", vocab_size)

# 1-3. Negative sampling
print("Generating negative samples...")

training_dataset = [skipgrams(sample, vocabulary_size=vocab_size, window_size=10)
                    for sample in encoded[:1000]]
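# skipgrams() returns (pairs, labels): label 1 marks a (target, context) pair
# observed within the window, label 0 a randomly drawn negative sample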

# ✅ Unpack couples and labels from the first sample
couples, labels = training_dataset[0]

# ✅ Print 5 example pairs
for i in range(5):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
        idx2word[couples[i][0]], couples[i][0],
        idx2word[couples[i][1]], couples[i][1],
        labels[i]
    ))


# 1-4. Skip-gram with negative sampling (SGNS)
print("Building the skip-gram model...")

embedding_dim = 100

# Embedding table for the center (target) word
w_inputs = Input(shape=(1, ), dtype='int32')
word_embedding = Embedding(vocab_size, embedding_dim)(w_inputs)

# Embedding table for the context word
c_inputs = Input(shape=(1, ), dtype='int32')
context_embedding = Embedding(vocab_size, embedding_dim)(c_inputs)

dot_product = Dot(axes=2)([word_embedding, context_embedding])
dot_product = Reshape((1,))(dot_product)
output = Activation('sigmoid')(dot_product)
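# Sigmoid of the dot product estimates the probability that a (target, context)
# pair is a genuine skip-gram pair (label 1) rather than a negative sample (label 0)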

model = Model(inputs=[w_inputs, c_inputs], outputs=output)
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam')
plot_model(model, to_file='model3.png', show_shapes=True, show_layer_names=True, rankdir='TB')
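# Note: plot_model requires the pydot and graphviz packages to be installed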

for epoch in range(10):
    loss = 0
    for _, elem in enumerate(training_dataset):
        # Skip samples for which skipgrams() produced no pairs
        if not elem[0]:
            continue
        first_elem = np.array(list(zip(*elem[0]))[0], dtype='int32')
        second_elem = np.array(list(zip(*elem[0]))[1], dtype='int32')
        labels = np.array(elem[1], dtype='int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X, Y)
    print('Epoch :', epoch + 1, 'Loss :', loss)
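# Note: each document's pairs are trained as a single batch via train_on_batch,
# so 10 epochs over 1,000 samples can take a while on CPU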

# 1-5. Checking embedding quality
print("Checking embedding quality...")

# Save the target-word embedding matrix in word2vec text format:
# a "vocab_size dim" header line, then one word and its vector per line
f = open('practice/vectors.txt', 'w')
f.write('{} {}\n'.format(vocab_size - 1, embedding_dim))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

# Load the saved vectors with gensim
w2v = gensim.models.KeyedVectors.load_word2vec_format('./practice/vectors.txt', binary=False)

print(w2v.most_similar(positive=['apple']))
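# If 'apple' was filtered out upstream, the lookup above raises a KeyError;
# a minimal guard (sketch, using gensim 4's key_to_index mapping):
# if 'apple' in w2v.key_to_index:
#     print(w2v.most_similar(positive=['apple']))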
153 changes: 153 additions & 0 deletions WEEK02/practice/kor_word2vec.py
@@ -0,0 +1,153 @@
# 2. Building and visualizing Korean word embeddings
# Requires JDK 11 (KoNLPy's Okt tagger runs on the JVM)

import urllib.request
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import jpype
from konlpy.tag import Okt
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from matplotlib import rc
import matplotlib.font_manager as fm
import os

# Pick an installed Hangul-capable font so matplotlib can render Korean labels
# (assumes NanumBarunGothic is available whenever a Nanum/NotoSans font file is found)
for f in fm.findSystemFonts(fontpaths=None, fontext='ttf'):
    if "Nanum" in f or "NotoSans" in f:
        rc('font', family='NanumBarunGothic')
        break


# 2-1. Data collection
print("Collecting data...")

urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt", filename="practice/ratings_train.txt")
urllib.request.urlretrieve("https://raw.githubusercontent.com/e9t/nsmc/master/ratings_test.txt", filename="practice/ratings_test.txt")

# ✅ Load the data (from the same practice/ paths the files were downloaded to)
train_dataset = pd.read_table("practice/ratings_train.txt")
test_dataset = pd.read_table("practice/ratings_test.txt")

# 2-2. Data preprocessing
print("Preprocessing data...")

# Handle missing values
train_dataset.replace("", float("NaN"), inplace=True)
train_dataset = train_dataset.dropna().reset_index(drop=True)

# Remove duplicates
train_dataset = train_dataset.drop_duplicates(['document']).reset_index(drop=True)

# Remove non-Hangul characters
train_dataset['document'] = train_dataset['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","", regex=True)

# Drop tokens shorter than 3 characters
train_dataset['document'] = train_dataset['document'].apply(
    lambda x: ' '.join([token for token in x.split() if len(token) > 2])
)

# Drop documents whose total length is 10 characters or fewer, or that have 5 or fewer tokens
train_dataset = train_dataset[train_dataset.document.apply(
    lambda x: len(str(x)) > 10 and len(str(x).split()) > 5
)].reset_index(drop=True)

# Define stopwords (common Korean particles and fillers)
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를',
             '으로','자','에','와','한','하다']

# 2-3. Tokenization
print("Tokenizing with the morphological analyzer...")

okt = Okt()
tokenized_data = []

for sentence in train_dataset['document']:
    tokenized_sentence = okt.morphs(sentence, stem=True)  # tokenize; stem=True normalizes words to dictionary form
    stopwords_removed = [word for word in tokenized_sentence if word not in stopwords]  # remove stopwords
    tokenized_data.append(stopwords_removed)

# 2-4. Inspecting the data distribution
print("Checking data distribution...")

print('Max review length :', max(len(review) for review in tokenized_data))
print('Mean review length :', sum(map(len, tokenized_data)) / len(tokenized_data))

plt.hist([len(review) for review in tokenized_data], bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

# 2-5. Building the word embedding
print("Building word embeddings...")

embedding_dim = 100

model = Word2Vec(
    sentences=tokenized_data,
    vector_size=embedding_dim,
    window=5,
    min_count=5,  # ignore words that appear fewer than 5 times
    workers=4,
    sg=0          # 0 = CBOW, 1 = skip-gram
)
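# For comparison, sg=1 would train a skip-gram model instead of CBOW; skip-gram
# tends to represent rare words better at the cost of slower training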

word_vectors = model.wv
vocabs = list(word_vectors.key_to_index.keys())

# Check similarities
print("👉 Words similar to '마블':")
for sim_word in model.wv.most_similar("마블"):
    print(sim_word)

print("👉 '슬픔' vs '눈물' 유사도:", model.wv.similarity('슬픔', '눈물'))

# 2-6. Visualization with PCA
print("Running PCA visualization...")

word_vector_list = [word_vectors[word] for word in vocabs]

pca = PCA(n_components=2)
xys = pca.fit_transform(word_vector_list)
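# PCA keeps the two highest-variance directions of the 100-dim vectors,
# so the 2D plot preserves only coarse global structure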

x_axis = xys[:, 0]
y_axis = xys[:, 1]

def plot_pca_graph(vocabs, x_axis, y_axis):
    plt.figure(figsize=(25, 15))
    plt.scatter(x_axis, y_axis, marker='o')
    for i, v in enumerate(vocabs):
        plt.annotate(v, xy=(x_axis[i], y_axis[i]))
    plt.show()

plot_pca_graph(vocabs, x_axis, y_axis)

# 2-7. Visualization with t-SNE
print("Running t-SNE visualization...")

# Note: recent scikit-learn versions rename TSNE's n_iter parameter to max_iter
tsne = TSNE(learning_rate=100, n_iter=1000, perplexity=30)
word_vector_list = np.array(word_vector_list)
transformed = tsne.fit_transform(word_vector_list)
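# Unlike PCA, t-SNE preserves local neighborhoods rather than global distances:
# nearby points are meaningful, distances between clusters are not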

x_axis_tsne = transformed[:, 0]
y_axis_tsne = transformed[:, 1]

def plot_tsne_graph(vocabs, x_axis, y_axis):
    plt.figure(figsize=(30, 30))
    plt.scatter(x_axis, y_axis, marker='o')
    for i, v in enumerate(vocabs):
        plt.annotate(v, xy=(x_axis[i], y_axis[i]))
    plt.show()

plot_tsne_graph(vocabs, x_axis_tsne, y_axis_tsne)

# 2-8. Saving for the Embedding Projector

print("Saving file for the Embedding Projector...")

model.wv.save_word2vec_format('sample_word2vec_embedding')

# Running gensim's built-in conversion script makes this embedding viewable in
# TensorBoard's Embedding Projector:
# !python -m gensim.scripts.word2vec2tensor --input sample_word2vec_embedding --output sample_word2vec_embedding
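# The script is expected to emit sample_word2vec_embedding_tensor.tsv and
# sample_word2vec_embedding_metadata.tsv (names per gensim's docs), which can
# also be loaded at https://projector.tensorflow.org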