
Image Embeddings

import tensorflow as tf
from tensorflow.keras.layers import *
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

MNIST Dataset

(x_train, y_train), (x_test, y_test) = \
    tf.keras.datasets.mnist.load_data()

Pairing the Data

def make_pairs(x, y):
    img1 = []
    img2 = []
    match = []
    total = len(y)
    # draw a random candidate pair of indices for every example
    indices = np.random.choice(total, (total, 2))
    # allow at most half of the pairs to have different labels
    n_diff = total // 2
    for k in range(total):
        i, j = indices[k]
        if n_diff == 0:
            # mismatch budget exhausted: advance j until the labels match
            while y[i] != y[j]:
                j = (j + 1) % total
        elif y[i] != y[j]:
            n_diff -= 1
        img1.append(x[i])
        img2.append(x[j])
        match.append(y[i] == y[j])
    return np.array(img1), np.array(img2), np.array(match, dtype='int')
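
A quick sanity check on a small slice (illustrative, not part of the original pipeline): the three returned arrays are aligned, and roughly half of the pairs match.

x1, x2, m = make_pairs(x_train[:1000], y_train[:1000])
print(x1.shape, x2.shape, m.shape)  # (1000, 28, 28) (1000, 28, 28) (1000,)
print(m.mean())                     # close to 0.5: about half the pairs match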

Embedding Model

emb_model = tf.keras.Sequential([
    Rescaling(1/127.5, -1),                 # scale pixels from [0, 255] to [-1, 1]
    Reshape((28, 28, 1)),                   # add a channel dimension
    Conv2D(32, (3, 3), activation='relu'),
    MaxPool2D((2, 2)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPool2D((2, 2)),
    Flatten(),
    Dense(32, activation="tanh")            # 32-dimensional embedding
])
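
A quick shape check (illustrative): a batch of images maps to one 32-dimensional embedding per image.

print(emb_model(x_train[:4]).shape)  # (4, 32)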

Siamese Network for Training

input1 = Input(shape=(28, 28))
emb1 = emb_model(input1)
input2 = Input(shape=(28, 28))
emb2 = emb_model(input2)  # the same emb_model instance, so both branches share weights

logits = Dot(axes=1)([emb1, emb2])  # dot product of the two embeddings
prediction = tf.keras.activations.sigmoid(logits)

siamese_network = tf.keras.Model(
    inputs=[input1, input2], 
    outputs=prediction)
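
Because both branches call the same emb_model instance, the prediction is just the sigmoid of the dot product of the two shared embeddings. A minimal check (illustrative):

x1, x2 = x_train[:2], x_train[2:4]
p = siamese_network([x1, x2])
manual = tf.sigmoid(tf.reduce_sum(emb_model(x1) * emb_model(x2), axis=1, keepdims=True))
print(np.allclose(p, manual, atol=1e-6))  # True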

Training

train_img1, train_img2, train_match = make_pairs(x_train, y_train)
siamese_network.compile(loss='binary_crossentropy', metrics=['accuracy'])  # uses Keras's default rmsprop optimizer
siamese_network.fit([train_img1, train_img2], train_match)
1875/1875 [==============================] - 44s 23ms/step - loss: 0.3906 - accuracy: 0.8172
<keras.src.callbacks.History at 0x25287d34760>

Testing

test_img1, test_img2, test_match = make_pairs(x_test, y_test)
siamese_network.evaluate([test_img1, test_img2], test_match)
313/313 [==============================] - 3s 9ms/step - loss: 0.3600 - accuracy: 0.8371
[0.3599715530872345, 0.8371000289916992]

Converting x_test to Embeddings

embs = emb_model(x_test)

Compute the cosine distance (= 1 - cosine similarity) between image 289 and every other image, then pick the 10 images with the smallest distance.

i = 289
dists = cosine_distances(embs[i:i+1], embs)
top10 = np.argsort(dists)[0, :10]
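
cosine_distances is just 1 minus the normalized dot product; recomputing one distance by hand (illustrative) confirms the result:

e = np.asarray(embs)
u, v = e[i], e[top10[1]]
by_hand = 1 - u @ v / (np.linalg.norm(u) * np.linalg.norm(v))
print(np.isclose(by_hand, dists[0, top10[1]]))  # True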

Viewing the 10 Most Similar Images

from PIL import Image

# paste the 10 nearest images side by side into one 280x28 strip
dst = Image.new('L', (280, 28))
for k, idx in enumerate(top10):
    dst.paste(Image.fromarray(x_test[idx]), (k * 28, 0))
dst
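
If the embedding works, the neighbors should mostly share the query's label; a quick check (illustrative):

print(y_test[i], y_test[top10])  # query label vs. labels of the 10 nearest images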

CLIP

Loading the Model

Create a processor for preprocessing inputs to OpenAI's CLIP model, and load the pretrained model itself.

from transformers import CLIPProcessor, CLIPModel
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

Loading Images

import glob
from PIL import Image

images = []
for path in glob.glob('coco/*.jpg'):
    images.append(Image.open(path))
images[0]

Zero-Shot Classification

Preprocessing

inputs = processor(
    text=["a photo of sleeping cats", "a photo of dogs"],
    images=images[0],
    return_tensors="pt",
    padding=True)
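
The processor tokenizes the texts and resizes and normalizes the image in a single call. Inspecting what it returns (an illustrative check; the text shapes depend on the prompts):

for k, v in inputs.items():
    print(k, tuple(v.shape))
# e.g. input_ids (2, 7), attention_mask (2, 7), pixel_values (1, 3, 224, 224)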

Run the model on the inputs

outputs = model(**inputs)

Extract the logits from the model output

outputs.logits_per_image
tensor([[29.6581, 21.5314]], grad_fn=<TBackward0>)

Probabilities

import torch
torch.softmax(outputs.logits_per_image, dim=-1)
tensor([[9.9970e-01, 2.9545e-04]], grad_fn=<SoftmaxBackward0>)

Image Similarity

Preprocess the images

inputs = processor(images=images, return_tensors="pt")

Extract embeddings for the input images and convert them to a NumPy array

embs = model.get_image_features(**inputs)
embs = embs.detach().numpy()

Compute the cosine similarity

from sklearn.metrics.pairwise import cosine_similarity
sims = cosine_similarity(embs)

Sort the image indices by similarity to the first image

import numpy as np
np.argsort(sims[0])
array([4, 5, 3, 7, 9, 6, 2, 1, 8, 0], dtype=int64)
images[8]
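
np.argsort sorts in ascending order, so the most similar indices come last: index 0 is the query image itself (similarity 1), and images[8] is its nearest neighbor. To rank most-similar first (illustrative):

ranked = np.argsort(sims[0])[::-1]  # descending similarity
ranked[1:]  # drop the query itself; index 8 comes first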

Embedding Search

Installing chromadb on Windows may require the Microsoft C++ Build Tools: https://visualstudio.microsoft.com/visual-cpp-build-tools/

pip install chromadb

import chromadb

client = chromadb.Client()

# dummy embedding function: we always supply precomputed embeddings,
# so this is never actually used to embed anything
def embedding_function(imgs):
    return []

collection = client.create_collection(
    name="handwriting",
    embedding_function=embedding_function)

Registering the Training Data

ids = []
metadatas = []
for i in range(60000):
    ids.append(f'train{i}')
    metadatas.append({
        'dataset': 'train',
        'index': i,
        'label': int(y_train[i])})

# embed all 60,000 training images with the trained model
embeddings = emb_model(x_train).numpy().tolist()

collection.add(
    metadatas=metadatas,
    embeddings=embeddings,
    ids=ids)
collection.count()
60000
i = 0
# embed the first test image and use it as the query vector
query = emb_model(x_test[i:i+1]).numpy()[0].tolist()
results = collection.query(
    query_embeddings=[query],
    n_results=5,
    where={'$and': [
        {"dataset": 'train'},
        {'index': {'$lt': 10000}}  # search only the first 10,000 training images
    ]},
)
results
{'ids': [['train7001', 'train8476', 'train6525', 'train522', 'train962']],
 'distances': [[0.22980588674545288,
   0.2330973744392395,
   0.25129202008247375,
   0.265739768743515,
   0.27467530965805054]],
 'metadatas': [[{'dataset': 'train', 'index': 7001, 'label': 7},
   {'dataset': 'train', 'index': 8476, 'label': 7},
   {'dataset': 'train', 'index': 6525, 'label': 7},
   {'dataset': 'train', 'index': 522, 'label': 7},
   {'dataset': 'train', 'index': 962, 'label': 7}]],
 'embeddings': None,
 'documents': [[None, None, None, None, None]]}
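
The where filter supports plain equality as well as operators like $lt; for example, restricting the neighbors to a single digit class (an illustrative variation):

results_7 = collection.query(
    query_embeddings=[query],
    n_results=5,
    where={'label': 7})  # only return neighbors labeled 7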
from PIL import Image

# paste the 5 retrieved images side by side into one strip
dst = Image.new('L', (28 * 5, 28))
for k, metadata in enumerate(results['metadatas'][0]):
    idx = metadata['index']
    img = Image.fromarray(x_train[idx])
    dst.paste(img, (k * 28, 0))
dst