이미지 임베딩
import tensorflow as tf
from tensorflow.keras.layers import *
import numpy as np
from sklearn.metrics.pairwise import cosine_distances
MNIST 데이터셋
# Load the MNIST train/test splits through Keras and unpack them into
# image arrays (x_*) and label arrays (y_*).
mnist_splits = tf.keras.datasets.mnist.load_data()
(x_train, y_train), (x_test, y_test) = mnist_splits
데이터 짝 짓기
def make_pairs(x, y, rng=None):
    """Build random image pairs with a same-label / different-label target.

    One candidate pair of indices is drawn (with replacement) per sample.
    At most ``len(y) // 2`` pairs are allowed to have different labels;
    once that budget is used up, every further mismatched candidate is
    repaired by advancing the second index (wrapping around) until it
    points at a sample with the same label as the first.

    Args:
        x: Indexable collection of samples (e.g. an image array).
        y: Labels aligned with ``x``; ``len(y)`` pairs are produced.
        rng: Optional random source exposing ``choice(a, size)``, such as
            ``np.random.default_rng(seed)`` or ``np.random.RandomState``.
            When ``None`` (default) the global ``np.random`` state is used,
            which is the original behaviour.

    Returns:
        Tuple ``(img1, img2, match)`` of NumPy arrays, where ``match[k]`` is
        1 if the k-th pair shares a label and 0 otherwise.
    """
    sampler = np.random if rng is None else rng
    total = len(y)
    indices = sampler.choice(total, (total, 2))
    n_diff = total // 2  # remaining budget of different-label pairs

    img1 = []
    img2 = []
    match = []
    for i, j in indices:
        if y[i] != y[j]:
            if n_diff > 0:
                # Still within budget: keep this pair as a negative example.
                n_diff -= 1
            else:
                # Budget exhausted: scan forward for a same-label partner.
                # Terminates because index i itself always matches.
                while y[i] != y[j]:
                    j = (j + 1) % total
        img1.append(x[i])
        img2.append(x[j])
        match.append(y[i] == y[j])
    return np.array(img1), np.array(img2), np.array(match, dtype='int')
임베딩 모델
# Small CNN that turns one 28x28 image into a 32-dimensional embedding.
emb_model = tf.keras.Sequential([
    # x * (1 / 127.5) - 1: maps pixel value 0 to -1 and 255 to +1.
    Rescaling(scale=1/127.5, offset=-1),
    # Add the trailing channel axis expected by Conv2D.
    Reshape(target_shape=(28, 28, 1)),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Flatten(),
    # tanh keeps every embedding component inside (-1, 1).
    Dense(units=32, activation="tanh"),
])
훈련을 위한 샴 네트워크
# Two image inputs pass through the SAME emb_model instance, so both
# branches share one set of weights.
input1 = Input(shape=(28, 28))
input2 = Input(shape=(28, 28))
emb1, emb2 = emb_model(input1), emb_model(input2)

# The dot product of the two embeddings is the match logit; sigmoid turns
# it into a probability that the pair shows the same digit.
logits = Dot(axes=1)([emb1, emb2])
prediction = tf.keras.activations.sigmoid(logits)

siamese_network = tf.keras.Model(inputs=[input1, input2], outputs=prediction)
훈련
# Pair up the training images and fit the siamese network on the binary
# "same digit?" target. No optimizer, epoch count or batch size is passed,
# so the Keras defaults apply.
train_pairs = make_pairs(x_train, y_train)
train_img1, train_img2, train_match = train_pairs
siamese_network.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
siamese_network.fit(x=[train_img1, train_img2], y=train_match)
1875/1875 [==============================] - 44s 23ms/step - loss: 0.3906 - accuracy: 0.8172
<keras.src.callbacks.History at 0x25287d34760>
테스트
# Build held-out pairs from the test split and report [loss, accuracy].
test_pairs = make_pairs(x_test, y_test)
test_img1, test_img2, test_match = test_pairs
siamese_network.evaluate(x=[test_img1, test_img2], y=test_match)
313/313 [==============================] - 3s 9ms/step - loss: 0.3600 - accuracy: 0.8371
[0.3599715530872345, 0.8371000289916992]
x_test를 임베딩으로 변환
# Embed the whole test set. Calling emb_model(x_test) directly would push
# every image through the network as one batch; Model.predict processes the
# data in mini-batches and returns a NumPy array, keeping peak memory bounded.
embs = emb_model.predict(x_test, batch_size=256, verbose=0)
289번 이미지와 다른 이미지의 코사인 거리(=1-코사인 유사도)를 구하고, 가장 거리가 가까운 이미지 10개를 고름
# Rank every test embedding by cosine distance to the query image.
i = 289
query_emb = embs[i:i + 1]  # slice (not embs[i]) keeps the 2-D shape
dists = cosine_distances(query_emb, embs)
# dists has a single row; take the indices of its 10 smallest distances.
# The query image is part of embs, so it is itself a candidate here.
top10 = np.argsort(dists[0])[:10]
CPU times: total: 0 ns Wall time: 4.2 ms
가장 비슷한 이미지 10개 보기
from PIL import Image

# Blank 8-bit grayscale canvas, 280x28: room for ten 28x28 tiles in a row.
dst = Image.new(mode='L', size=(280, 28))
x_offset = 0
for idx in top10:
    tile = Image.fromarray(x_test[idx])
    dst.paste(tile, box=(x_offset, 0))
    x_offset += 28
dst
CLIP
모델 로딩
OpenAI의 CLIP 모델과, 입력 전처리를 위한 프로세서를 로딩
from transformers import CLIPProcessor, CLIPModel

# Processor (input preprocessing) and model weights must come from the same
# checkpoint, so the name is stated once.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
이미지 로딩
import glob
from PIL import Image

# Load every JPEG under coco/ fully into memory.
# - sorted(): glob order depends on the filesystem; sorting makes the list
#   indices stable. NOTE(review): the hard-coded indices used further down
#   (images[0], images[8]) assume a particular order - confirm they still
#   point at the intended pictures.
# - with/copy(): Image.open is lazy and keeps the file open; copy() forces the
#   pixel data to load and the context manager then closes the file handle.
images = []
for path in sorted(glob.glob('coco/*.jpg')):
    with Image.open(path) as img:
        images.append(img.copy())
images[0]
zero-shot 분류
전처리
# Candidate captions act as the class labels for zero-shot classification.
# padding=True pads the tokenized captions to a common length so they can be
# stacked into a single PyTorch ("pt") batch alongside the image.
candidate_captions = ["a photo of sleeping cats", "a photo of dogs"]
inputs = processor(
    text=candidate_captions,
    images=images[0],
    padding=True,
    return_tensors="pt",
)
모델에 입력
import torch

# Inference only: disable autograd so no computation graph is recorded for
# the forward pass (the outputs then carry no grad_fn).
with torch.no_grad():
    outputs = model(**inputs)
모델 출력에서 로짓 값을 추출
# Image-to-text similarity logits: one row per input image, one column per
# candidate caption (1 x 2 for the single image and two captions used here).
outputs.logits_per_image
tensor([[29.6581, 21.5314]], grad_fn=<TBackward0>)
확률
import torch

# Softmax over the caption axis turns the logits into per-caption
# probabilities that sum to 1 for each image.
caption_logits = outputs.logits_per_image
caption_logits.softmax(dim=-1)
tensor([[9.9970e-01, 2.9545e-04]], grad_fn=<SoftmaxBackward0>)
이미지 유사도
이미지들을 전처리
# Preprocess all loaded images into one PyTorch batch (no text this time).
# This rebinds `inputs`, replacing the zero-shot inputs built earlier.
inputs = processor(images=images, return_tensors="pt")
입력 이미지들의 임베딩을 추출하고 numpy 배열로 변환
import torch

# Compute one CLIP embedding per image. Autograd is disabled because this is
# inference only; the result therefore needs no .detach() before conversion.
with torch.no_grad():
    embs = model.get_image_features(**inputs)
embs = embs.numpy()
코사인 유사도를 계산
from sklearn.metrics.pairwise import cosine_similarity
# Called with a single matrix, this compares every embedding with every
# other one: sims[a][b] is the cosine similarity of image a and image b.
sims = cosine_similarity(embs)
0번 이미지와의 유사도가 낮은 순(오름차순)으로 이미지 인덱스를 정렬 (마지막 인덱스가 가장 유사한 이미지)
import numpy as np
# Indices of all images ordered by ascending similarity to image 0, so the
# most similar images come last.
np.argsort(sims[0])
array([4, 5, 3, 7, 9, 6, 2, 1, 8, 0], dtype=int64)
# Display image 8: in the recorded argsort output it is second to last, i.e.
# the most similar image to images[0] other than images[0] itself.
# NOTE(review): the index is hard-coded from that run - re-check it if the
# image set or its ordering changes.
images[8]
임베딩 검색
Microsoft C++ Build Tools 다운로드: https://visualstudio.microsoft.com/visual-cpp-build-tools/
pip install chromadb
import chromadb
# Client created with default settings.
client = chromadb.Client()
def embedding_function(imgs):
    """Placeholder embedding callback that always returns an empty list.

    The embeddings stored in and queried from the collection are computed
    with emb_model and passed in explicitly, so this function only fills the
    `embedding_function` slot of `create_collection`.
    NOTE(review): presumably it is never invoked as long as every add/query
    call supplies embeddings - confirm against the installed chromadb version.
    """
    return []
# Collection that will hold the MNIST image embeddings.
collection = client.create_collection(
    name="handwriting",
    embedding_function=embedding_function)
train 데이터 등록
# Register every training image in the collection: a unique id, filterable
# metadata, and the embedding computed by emb_model.
n_train = len(x_train)  # derived from the data instead of a hard-coded 60000
ids = [f'train{i}' for i in range(n_train)]
metadatas = [
    {
        'dataset': 'train',
        'index': i,
        # Cast to a plain int: labels come from a NumPy array.
        'label': int(label),
    }
    for i, label in enumerate(y_train)
]

# Model.predict runs the network in mini-batches; calling emb_model(x_train)
# directly would process the whole training set as a single batch.
embeddings = emb_model.predict(x_train, batch_size=256, verbose=0).tolist()

# Insert in chunks rather than one huge call.
# NOTE(review): some chromadb versions reject an add() above a maximum batch
# size; 5000 is assumed to be below that limit - confirm for your version.
ADD_CHUNK_SIZE = 5000
for start in range(0, n_train, ADD_CHUNK_SIZE):
    stop = start + ADD_CHUNK_SIZE
    collection.add(
        metadatas=metadatas[start:stop],
        embeddings=embeddings[start:stop],
        ids=ids[start:stop])
collection.count()
60000
# Embed a single test image; the slice keeps the batch axis the model needs,
# and [0] then drops it to leave one flat embedding.
i = 0
query = emb_model(x_test[i:i + 1]).numpy()[0].tolist()

# Metadata filter: only entries tagged 'train' whose index is below 10000.
first_10k_train = {'$and': [
    {"dataset": 'train'},
    {'index': {'$lt': 10000}},
]}

# Five nearest stored embeddings among the entries that pass the filter.
results = collection.query(
    query_embeddings=[query],
    n_results=5,
    where=first_10k_train,
)
results
{'ids': [['train7001', 'train8476', 'train6525', 'train522', 'train962']], 'distances': [[0.22980588674545288, 0.2330973744392395, 0.25129202008247375, 0.265739768743515, 0.27467530965805054]], 'metadatas': [[{'dataset': 'train', 'index': 7001, 'label': 7}, {'dataset': 'train', 'index': 8476, 'label': 7}, {'dataset': 'train', 'index': 6525, 'label': 7}, {'dataset': 'train', 'index': 522, 'label': 7}, {'dataset': 'train', 'index': 962, 'label': 7}]], 'embeddings': None, 'documents': [[None, None, None, None, None]]}
from PIL import Image

# results['metadatas'][0] holds the metadata of the hits for the single
# query; each entry's 'index' points back into x_train.
neighbor_indices = [meta['index'] for meta in results['metadatas'][0]]

# Paste the five matching 28x28 training images side by side.
dst = Image.new('L', (28 * 5, 28))
for slot, train_idx in enumerate(neighbor_indices):
    dst.paste(Image.fromarray(x_train[train_idx]), (slot * 28, 0))
dst