BahdanauAttention 정리하기

인공지능/공부

BahdanauAttention 정리하기

이게될까 2024. 4. 26. 22:23

728x90

일단 원 코드!

class Encoder(tf.keras.Model):
    """Sequence encoder: a token embedding followed by a single GRU layer."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Create the embedding table and the GRU.

        Args:
            vocab_size: First argument of the Embedding layer (number of ids).
            embedding_dim: Second argument of the Embedding layer (vector width).
            enc_units: Number of units of the GRU.
            batch_sz: Batch size; only used to shape the zero initial state.
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # Ask the GRU for both its per-step outputs and its final state.
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed ``x`` and run the GRU starting from ``hidden``.

        Returns:
            The ``(output, state)`` pair produced by the GRU.
        """
        embedded = self.embedding(x)
        output, state = self.gru(embedded, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero tensor of shape ``(batch_sz, enc_units)``."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)
    
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: ``score = V(tanh(W1(query) + W2(values)))``."""

    def __init__(self, units):
        """Create the three Dense projections.

        Args:
            units: Output width of the ``W1`` and ``W2`` projections.
        """
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector and the attention weights.

        Assumes ``query`` is (batch, hidden) and ``values`` is
        (batch, time, features) -- TODO confirm against the caller.

        Returns:
            ``(context_vector, attention_weights)``; the weights are
            softmax-normalised over axis 1 and the context vector is the
            weighted ``values`` summed over that same axis.
        """
        # Insert an axis at position 1 so the query broadcasts against values.
        expanded_query = tf.expand_dims(query, 1)
        combined = self.W1(expanded_query) + self.W2(values)
        # Dense(1) collapses the last dimension to a single score per position.
        score = self.V(tf.nn.tanh(combined))
        attention_weights = tf.nn.softmax(score, axis=1)
        weighted_values = attention_weights * values
        context_vector = tf.reduce_sum(weighted_values, axis=1)

        return context_vector, attention_weights
    
class Decoder(tf.keras.Model):
    """Decoder step: Bahdanau attention, embedding, GRU, then a Dense layer."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Create the sub-layers.

        Args:
            vocab_size: Embedding input size and width of the final Dense layer.
            embedding_dim: Width of each embedding vector.
            dec_units: Number of GRU units; also the attention projection width.
            batch_sz: Batch size (stored, not otherwise used in this class).
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform',
        )
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding call.

        Args:
            x: Token ids fed to the embedding layer.
            hidden: Query passed to the attention layer.
            enc_output: Values the attention layer attends over.

        Returns:
            ``(x, state, attention_weights)``: the Dense-layer output, the GRU
            state, and the weights returned by the attention layer.
        """
        context_vector, attention_weights = self.attention(hidden, enc_output)
        embedded = self.embedding(x)
        # Give the context vector an axis 1 so it can be concatenated with the
        # embedded input along the last axis.
        context_step = tf.expand_dims(context_vector, 1)
        gru_input = tf.concat([context_step, embedded], axis=-1)
        # The GRU is called without an explicit initial state.
        output, state = self.gru(gru_input)
        # Fold the leading axes together, keeping the last (feature) axis.
        flattened = tf.reshape(output, (-1, output.shape[2]))
        projected = self.fc(flattened)
        return projected, state, attention_weights

차원!

class Encoder(tf.keras.Model):
    """Embedding + GRU encoder, annotated with tensor shapes."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Store the sizes and build the embedding and GRU layers."""
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,  # output shape: [batch_sz, sequence_length, enc_units]
            return_state=True,  # state shape: [batch_sz, enc_units]
            recurrent_initializer='glorot_uniform',
        )

    def call(self, x, hidden):
        """Embed ``x``, run the GRU from ``hidden``, return ``(output, state)``."""
        # After embedding: [batch_sz, sequence_length, embedding_dim]
        embedded = self.embedding(x)
        # output: [batch_sz, sequence_length, enc_units]; state: [batch_sz, enc_units]
        output, state = self.gru(embedded, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return the zero initial state, shape [batch_sz, enc_units]."""
        zero_state = tf.zeros((self.batch_sz, self.enc_units))
        return zero_state
    
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau) attention, annotated with tensor shapes."""

    def __init__(self, units):
        """Create the ``W1``/``W2`` projections (width ``units``) and ``V`` (width 1)."""
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)`` for ``query`` over ``values``."""
        # Query with an added time axis: [batch_sz, 1, dec_units]
        q_expanded = tf.expand_dims(query, 1)
        # Sum of the two projections, broadcast over the time axis.
        summed = self.W1(q_expanded) + self.W2(values)
        # Score: [batch_sz, sequence_length, 1]
        score = self.V(tf.nn.tanh(summed))
        # Attention weights: [batch_sz, sequence_length, 1]
        attention_weights = tf.nn.softmax(score, axis=1)
        # Weighted values: [batch_sz, sequence_length, enc_units]
        weighted = attention_weights * values
        # Summed over the time axis: [batch_sz, enc_units]
        context_vector = tf.reduce_sum(weighted, axis=1)

        return context_vector, attention_weights
    
class Decoder(tf.keras.Model):
    """Decoder step with Bahdanau attention, annotated with tensor shapes.

    The attention layer is created in ``__init__``; without it ``call`` would
    fail on ``self.attention``.
    """

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Create the sub-layers.

        Args:
            vocab_size: Embedding input size and width of the final Dense layer.
            embedding_dim: Width of each embedding vector.
            dec_units: Number of GRU units; also the attention projection width.
            batch_sz: Batch size (stored, not otherwise used in this class).
        """
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,  # output shape: [batch_sz, 1, dec_units]
                                       return_state=True,  # state shape: [batch_sz, dec_units]
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        # Required by call(); same construction as the other Decoder listings.
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding call.

        Args:
            x: Token ids for a single time step (the concat below requires the
                embedded input to have a time axis of length 1).
            hidden: Query passed to the attention layer.
            enc_output: Values the attention layer attends over.

        Returns:
            ``(x, state, attention_weights)``: the Dense-layer output, the GRU
            state, and the attention weights.
        """
        # Context vector: [batch_sz, enc_units]
        context_vector, attention_weights = self.attention(hidden, enc_output)
        # After embedding: [batch_sz, 1, embedding_dim]
        x = self.embedding(x)
        # expand_dims gives [batch_sz, 1, enc_units]; concatenating on the last
        # axis yields [batch_sz, 1, enc_units + embedding_dim].
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        # NOTE(review): `hidden` is used only as the attention query; the GRU is
        # called without initial_state -- confirm this is intended.
        # output: [batch_sz, 1, dec_units]; state: [batch_sz, dec_units]
        output, state = self.gru(x)
        # After reshape: [batch_sz * 1, dec_units]
        output = tf.reshape(output, (-1, output.shape[2]))
        # Final output: [batch_sz, vocab_size]
        x = self.fc(output)
        return x, state, attention_weights

Bahdanau Attention과 seq2seq 모델에서 흔히 쓰는 기본 attention(Luong 방식의 dot-product attention)은 둘 다 디코더가 매 스텝마다 인코더 출력의 어느 위치에 집중할지를 계산하는 어텐션 메커니즘입니다. (가중치 초기화 방법이 아닙니다. 분산을 1/n, 2/n으로 맞추는 것은 Xavier/He 초기화에 대한 설명입니다.) 이 두 방법의 주요 차이점은 어텐션 스코어를 계산하는 함수에 있습니다.

기본(곱셈형, Luong) attention은 쿼리와 키의 내적(dot) 또는 가중 내적(general: qᵀWk)으로 스코어를 계산합니다. 행렬 곱으로 바로 계산되므로 단순하고 빠르지만, dot 방식은 쿼리와 키의 차원이 같아야 합니다.

Bahdanau(덧셈형, additive) attention은 쿼리와 키를 각각 Dense 층(W1, W2)으로 변환해 더한 뒤 tanh를 적용하고, 다시 Dense 층(V)으로 스칼라 스코어를 만듭니다. 즉, score = V(tanh(W1·q + W2·k)) 입니다. 쿼리와 키의 차원이 달라도 사용할 수 있지만, 학습할 파라미터와 연산량이 더 많습니다.

따라서, Bahdanau Attention과 기본 seq2seq attention 중 어떤 것을 사용할지 결정할 때는 스코어 함수의 표현력, 쿼리·키 차원의 일치 여부, 연산 비용을 고려해야 합니다.

class Encoder(tf.keras.Model):
    """Encoder that embeds the input tokens and feeds them through a GRU."""

    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        """Store the sizes and create the embedding and GRU layers."""
        super().__init__()
        self.batch_sz = batch_sz  # keep the batch size
        self.enc_units = enc_units  # keep the number of encoder units
        # Embedding layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # GRU layer
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,  # return the output of every time step
            return_state=True,  # also return the last state
            recurrent_initializer='glorot_uniform',  # recurrent weight initializer
        )

    def call(self, x, hidden):
        """Return the GRU ``(output, state)`` for the embedded input."""
        embedded_tokens = self.embedding(x)  # embed the input data
        # Run the GRU starting from the supplied hidden state.
        output, state = self.gru(embedded_tokens, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return a zero tensor of shape ``(batch_sz, enc_units)`` as the initial state."""
        dims = (self.batch_sz, self.enc_units)
        return tf.zeros(dims)

class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention layer built from three Dense projections."""

    def __init__(self, units):
        """Create ``W1`` and ``W2`` (width ``units``) and ``V`` (width 1)."""
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)  # projection applied to the query
        self.W2 = tf.keras.layers.Dense(units)  # projection applied to the values
        self.V = tf.keras.layers.Dense(1)  # maps each position to one score

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)``."""
        # Add an axis at position 1 to the query.
        query_3d = tf.expand_dims(query, 1)
        # Score computation: V(tanh(W1(query) + W2(values))).
        activated = tf.nn.tanh(self.W1(query_3d) + self.W2(values))
        score = self.V(activated)
        # Attention weights: softmax over axis 1.
        attention_weights = tf.nn.softmax(score, axis=1)
        # Context vector: weight the values, then sum along axis 1.
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights
    
class Decoder(tf.keras.Model):
    """Decoder combining Bahdanau attention with an embedding, a GRU and a Dense layer."""

    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        """Store the sizes and create the sub-layers."""
        super().__init__()
        self.batch_sz = batch_sz  # keep the batch size
        self.dec_units = dec_units  # keep the number of decoder units
        # Embedding layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # GRU layer
        self.gru = tf.keras.layers.GRU(
            self.dec_units,
            return_sequences=True,  # return the output of every time step
            return_state=True,  # also return the last state
            recurrent_initializer='glorot_uniform',  # recurrent weight initializer
        )
        # Fully connected layer producing the final output
        self.fc = tf.keras.layers.Dense(vocab_size)
        # Bahdanau attention mechanism
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Return ``(output, state, attention_weights)`` for one decoding call."""
        # Apply attention with `hidden` as the query over `enc_output`.
        context_vector, attention_weights = self.attention(hidden, enc_output)
        # Embed the input data.
        token_embedding = self.embedding(x)
        # Concatenate the context vector with the embedding on the last axis.
        context_with_axis = tf.expand_dims(context_vector, 1)
        merged = tf.concat([context_with_axis, token_embedding], axis=-1)
        # Pass through the GRU (no explicit initial state is supplied).
        output, state = self.gru(merged)
        # Reshape the output to two dimensions, keeping the last axis.
        output_2d = tf.reshape(output, (-1, output.shape[2]))
        # Compute the final output.
        result = self.fc(output_2d)
        return result, state, attention_weights

class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau) attention with step-by-step commentary."""

    def __init__(self, units):
        """Create the query/key projections and the scoring layer."""
        super().__init__()
        # Dense layer W1 that transforms the query vector.
        self.W1 = tf.keras.layers.Dense(units)
        # Dense layer W2 that transforms the values (encoder outputs); the
        # transformed values play the role of the keys.
        self.W2 = tf.keras.layers.Dense(units)
        # Dense layer V that turns the combined query/key representation into
        # a single scalar score per position.
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Return ``(context_vector, attention_weights)``.

        Args:
            query: The decoder hidden state used as the attention query.
            values: The encoder outputs being attended over.
        """
        # Add a time axis to the query so its rank matches the values:
        # [batch size, 1, number of features].
        query_time_major = tf.expand_dims(query, 1)

        # Transform the query and the keys (values) with W1 and W2, add them
        # element-wise, and apply the hyperbolic tangent (tanh) activation.
        hidden_repr = tf.nn.tanh(self.W1(query_time_major) + self.W2(values))
        score = self.V(hidden_repr)

        # Normalise the scores with softmax along axis 1 to get one attention
        # weight per encoder output.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Scale each value (encoder output) by its attention weight.
        scaled_values = attention_weights * values

        # Sum the weighted values along the time axis (axis=1) to obtain the
        # final context vector.
        context_vector = tf.reduce_sum(scaled_values, axis=1)

        return context_vector, attention_weights

결국

인코더 - 임베딩 후 GRU

attention
score=tanh(q*w1 + k*w2)*v
attention_weight = softmax(score)
context_vector = sum(attention_weight * values)

디코더 - 히든 스테이트(쿼리)와 인코더 아웃풋(값)을 어텐션 -> 입력데이터를 임베딩하여 어텐션 결과(컨텍스트 벡터)와 concat -> GRU와 FC(Dense) 레이어를 통과하여 출력

저작자표시 (새창열림)

'인공지능 > 공부' 카테고리의 다른 글

인공지능과 빅데이터 9주차 2차시 - 데이터 수집, 관리, 분석 (0)	2024.04.29
인공지능과 빅데이터 9주차 1차시 - 데이터 사이언스 개요, 활용 분야 (0)	2024.04.29
모두를 위한 머신러닝 중간고사 (0)	2024.04.25
딥러닝 개론 중간고사 (0)	2024.04.25
딥러닝개론 중간고사 예측 (0)	2024.04.24

현재글BahdanauAttention 정리하기

인공지능, 자율주행에 관심있는 공대생의 일기장...?

Today :
Yesterday :

일	월	화	수	목	금	토
1	2	3	4	5	6	7
8	9	10	11	12	13	14
15	16	17	18	19	20	21
22	23	24	25	26	27	28
29	30

공대생 도전 일지