# nlb.tf
# 이 파일은 03-platform 레이어 또는 별도의 04-loadbalancer 레이어에 위치
# Remote State 참조
# Pulls the network layer's remote state to obtain VPC and subnet IDs.
data "terraform_remote_state" "network" {
  backend = "s3"

  config = {
    bucket = "${var.project_name}-${var.environment}-terraform-state"
    key    = "network/terraform.tfstate"
    region = var.region
  }
}
# Pulls the cluster layer's remote state.
# NOTE(review): no output of this data source is referenced in this chunk —
# confirm it is used elsewhere or drop it.
data "terraform_remote_state" "cluster" {
  backend = "s3"

  config = {
    bucket = "${var.project_name}-${var.environment}-terraform-state"
    key    = "cluster/terraform.tfstate"
    region = var.region
  }
}
# Short aliases for network-layer outputs.
# NOTE(review): blue_zone_subnets / green_zone_subnets are not referenced in
# this file; local.merged_tags (used by resources below) is defined elsewhere
# — confirm it exists in this layer.
locals {
  vpc_id             = data.terraform_remote_state.network.outputs.vpc_id
  public_subnet_ids  = data.terraform_remote_state.network.outputs.public_subnet_ids
  blue_zone_subnets  = data.terraform_remote_state.network.outputs.blue_zone_subnets
  green_zone_subnets = data.terraform_remote_state.network.outputs.green_zone_subnets
}
# ============================================
# Network Load Balancer
# ============================================
# Shared Network Load Balancer fronting both Blue and Green clusters.
resource "aws_lb" "shared" {
  name               = "${var.project_name}-${var.environment}-nlb"
  internal           = false
  load_balancer_type = "network"

  # Placed in the public subnets exported by the network layer (spans both AZs).
  subnets = local.public_subnet_ids

  # Cross-zone load balancing stays off to preserve the single-zone design.
  enable_cross_zone_load_balancing = false

  # Deletion protection in production only. A boolean comparison already
  # yields true/false — the original `? true : false` ternary was redundant.
  enable_deletion_protection = var.environment == "prod"

  tags = merge(local.merged_tags, {
    Name = "${var.project_name}-${var.environment}-shared-nlb"
  })
}
# ============================================
# Target Groups
# ============================================
# Blue 클러스터 타겟 그룹
# Target group for the Blue cluster (IP targets, TCP/443 passthrough).
resource "aws_lb_target_group" "blue" {
  name        = "${var.project_name}-${var.environment}-blue-tg"
  port        = 443
  protocol    = "TCP"
  vpc_id      = local.vpc_id
  target_type = "ip"

  # TCP health check on the traffic port.
  health_check {
    enabled             = true
    protocol            = "TCP"
    port                = "traffic-port"
    healthy_threshold   = 2
    unhealthy_threshold = 2
    interval            = 10
    timeout             = 5
  }

  # Short drain window for graceful shutdown, and terminate in-flight
  # connections once a target is deregistered.
  deregistration_delay   = 30
  connection_termination = true

  # Pods see the real client IP.
  preserve_client_ip = true

  # Proxy Protocol v2 left off.
  proxy_protocol_v2 = false

  tags = merge(local.merged_tags, {
    Name    = "${var.project_name}-${var.environment}-blue-tg"
    Cluster = "blue"
  })

  lifecycle {
    create_before_destroy = true
  }
}
# Green 클러스터 타겟 그룹
# Target group for the Green cluster — mirrors the Blue target group exactly.
resource "aws_lb_target_group" "green" {
  name        = "${var.project_name}-${var.environment}-green-tg"
  port        = 443
  protocol    = "TCP"
  vpc_id      = local.vpc_id
  target_type = "ip"

  health_check {
    enabled             = true
    protocol            = "TCP"
    port                = "traffic-port"
    healthy_threshold   = 2
    unhealthy_threshold = 2
    interval            = 10
    timeout             = 5
  }

  deregistration_delay   = 30
  connection_termination = true
  preserve_client_ip     = true
  proxy_protocol_v2      = false

  tags = merge(local.merged_tags, {
    Name    = "${var.project_name}-${var.environment}-green-tg"
    Cluster = "green"
  })

  lifecycle {
    create_before_destroy = true
  }
}
# ============================================
# Listener with Weighted Target Groups
# ============================================
# TCP/443 listener splitting traffic between Blue and Green by weight.
# NOTE(review): weighted forward actions (multiple target_group blocks) are
# documented by AWS as an Application Load Balancer feature; verify that the
# provider/API actually accepts this on a *Network* Load Balancer listener
# before relying on it for traffic shifting — an apply-time failure here
# would break the whole Blue/Green switch-over mechanism.
resource "aws_lb_listener" "https" {
load_balancer_arn = aws_lb.shared.arn
port = 443
protocol = "TCP"
default_action {
type = "forward"
forward {
# Blue target group (default 80%)
target_group {
arn = aws_lb_target_group.blue.arn
weight = var.blue_weight
}
# Green target group (default 20%)
target_group {
arn = aws_lb_target_group.green.arn
weight = var.green_weight
}
# Optional stickiness — for forward actions this is the ALB-documented
# sub-block; confirm support here for the same reason as above.
stickiness {
enabled = var.enable_stickiness
duration = 3600 # 1 hour
}
}
}
tags = merge(local.merged_tags, {
Name = "${var.project_name}-${var.environment}-https-listener"
})
}
# Port 80 listener. NOTE(review): the original comment called this an
# "HTTP to HTTPS redirect", but an NLB TCP listener cannot redirect — this
# simply *forwards* port-80 traffic to the same target groups, whose target
# port is 443. Confirm the backends expect plaintext clients on that port,
# or terminate/redirect HTTP elsewhere.
resource "aws_lb_listener" "http" {
load_balancer_arn = aws_lb.shared.arn
port = 80
protocol = "TCP"
default_action {
type = "forward"
forward {
target_group {
arn = aws_lb_target_group.blue.arn
weight = var.blue_weight
}
target_group {
arn = aws_lb_target_group.green.arn
weight = var.green_weight
}
}
}
tags = merge(local.merged_tags, {
Name = "${var.project_name}-${var.environment}-http-listener"
})
}
# variables.tf (NLB 관련)
# Traffic-weight inputs for the weighted forward action. Each weight is
# range-checked individually; the sum==100 invariant is enforced separately.
variable "blue_weight" {
  description = "Traffic weight for Blue cluster (0-100)"
  type        = number
  default     = 80

  validation {
    condition     = var.blue_weight >= 0 && var.blue_weight <= 100
    error_message = "Weight must be between 0 and 100."
  }
}

variable "green_weight" {
  description = "Traffic weight for Green cluster (0-100)"
  type        = number
  default     = 20

  validation {
    condition     = var.green_weight >= 0 && var.green_weight <= 100
    error_message = "Weight must be between 0 and 100."
  }
}

variable "enable_stickiness" {
  description = "Enable session stickiness"
  type        = bool
  default     = false
}
# Cross-variable guard: blue_weight + green_weight must equal 100.
# Variable `validation` blocks (pre-Terraform 1.9) cannot reference other
# variables, so this uses the classic file() trick — when the sum is wrong,
# file() is called with a nonexistent path whose "filename" is the error
# message, failing the plan and surfacing that text to the operator.
locals {
weight_sum = var.blue_weight + var.green_weight
validate_weights = (
local.weight_sum == 100 ? true :
file("ERROR: blue_weight + green_weight must equal 100")
)
}
# environments/prod.tfvars
# 일반 운영 (Blue 80%, Green 20%)
blue_weight = 80
green_weight = 20
# 카나리 배포 (Blue 95%, Green 5%)
# blue_weight = 95
# green_weight = 5
# Green으로 전환 중 (Blue 50%, Green 50%)
# blue_weight = 50
# green_weight = 50
# Green 전환 완료 (Blue 0%, Green 100%)
# blue_weight = 0
# green_weight = 100
# Blue로 롤백 (Blue 100%, Green 0%)
# blue_weight = 100
# green_weight = 0
# 가중치 변경 적용 (주의: -target은 일회성 응급 조치로만 사용하고, 이후 전체 apply로 상태 정합성을 확인할 것)
terraform apply -var-file="environments/prod.tfvars" -target=aws_lb_listener.https
# 또는 CLI에서 직접 지정
terraform apply -var="blue_weight=50" -var="green_weight=50"
# Manual target registration — used when Pod IPs are registered directly,
# without an ingress controller managing target-group bindings.
# NOTE(review): var.blue_target_ips / var.green_target_ips are declared
# elsewhere — confirm they exist in this layer.
resource "aws_lb_target_group_attachment" "blue_targets" {
  for_each = toset(var.blue_target_ips)

  target_group_arn = aws_lb_target_group.blue.arn
  target_id        = each.value
  port             = 443
}

resource "aws_lb_target_group_attachment" "green_targets" {
  for_each = toset(var.green_target_ips)

  target_group_arn = aws_lb_target_group.green.arn
  target_id        = each.value
  port             = 443
}
# outputs.tf — values consumed by Route 53 records and downstream layers.

output "nlb_dns_name" {
  description = "NLB DNS name"
  value       = aws_lb.shared.dns_name
}

output "nlb_zone_id" {
  description = "NLB Zone ID (for Route53 alias)"
  value       = aws_lb.shared.zone_id
}

output "nlb_arn" {
  description = "NLB ARN"
  value       = aws_lb.shared.arn
}

output "blue_target_group_arn" {
  description = "Blue target group ARN"
  value       = aws_lb_target_group.blue.arn
}

output "green_target_group_arn" {
  description = "Green target group ARN"
  value       = aws_lb_target_group.green.arn
}

output "current_weights" {
  description = "Current traffic weights"
  value = {
    blue  = var.blue_weight
    green = var.green_weight
  }
}
# route53.tf — public hosted zone lookup plus the main API record.
data "aws_route53_zone" "main" {
  name         = var.domain_name
  private_zone = false
}

# api.<domain> → shared NLB (alias record, health-evaluated).
resource "aws_route53_record" "api" {
  zone_id = data.aws_route53_zone.main.zone_id
  name    = "api.${var.domain_name}"
  type    = "A"

  alias {
    name                   = aws_lb.shared.dns_name
    zone_id                = aws_lb.shared.zone_id
    evaluate_target_health = true
  }
}
# "Direct access" record for the Blue cluster (debugging / testing).
# NOTE(review): this record aliases the *shared* NLB, whose listener still
# splits traffic by weight — so api-blue.<domain> does not actually pin
# requests to Blue. It is also the only record for this name, so the weighted
# routing policy has a single member and is effectively a no-op. Confirm the
# intent; a per-cluster LB or distinct listener would be needed for true
# direct access. var.dns_blue_weight is declared elsewhere — verify it exists.
resource "aws_route53_record" "api_blue" {
zone_id = data.aws_route53_zone.main.zone_id
name = "api-blue.${var.domain_name}"
type = "A"
alias {
name = aws_lb.shared.dns_name
zone_id = aws_lb.shared.zone_id
evaluate_target_health = true
}
# When weighted routing is used
set_identifier = "blue"
weighted_routing_policy {
weight = var.dns_blue_weight
}
}
# "Direct access" record for the Green cluster — same caveats as api_blue:
# it aliases the shared NLB (so it does not pin traffic to Green), and the
# weighted policy has only one member for this name, making it a no-op.
# var.dns_green_weight is declared elsewhere — verify it exists.
resource "aws_route53_record" "api_green" {
zone_id = data.aws_route53_zone.main.zone_id
name = "api-green.${var.domain_name}"
type = "A"
alias {
name = aws_lb.shared.dns_name
zone_id = aws_lb.shared.zone_id
evaluate_target_health = true
}
set_identifier = "green"
weighted_routing_policy {
weight = var.dns_green_weight
}
}
# health-check.tf — Route 53 health checks probing each cluster's /healthz
# over HTTPS. request_interval 10 is the "fast" tier (10s probes).
# NOTE(review): the probed FQDNs alias the shared NLB, so both checks hit the
# same endpoint pool — confirm they actually distinguish the two clusters.
resource "aws_route53_health_check" "blue" {
  fqdn              = "api-blue.${var.domain_name}"
  port              = 443
  type              = "HTTPS"
  resource_path     = "/healthz"
  failure_threshold = 3
  request_interval  = 10

  tags = merge(local.merged_tags, {
    Name    = "${var.project_name}-${var.environment}-blue-health"
    Cluster = "blue"
  })
}

resource "aws_route53_health_check" "green" {
  fqdn              = "api-green.${var.domain_name}"
  port              = 443
  type              = "HTTPS"
  resource_path     = "/healthz"
  failure_threshold = 3
  request_interval  = 10

  tags = merge(local.merged_tags, {
    Name    = "${var.project_name}-${var.environment}-green-health"
    Cluster = "green"
  })
}
# Failover routing (Primary: Blue, Secondary: Green).
# NOTE(review): both the primary and secondary records alias the *same*
# shared NLB — when Route 53 fails over, clients resolve to an identical
# endpoint, so this failover pair changes nothing in practice. The effective
# failover mechanism in this design is the NLB weight change (runbook below);
# confirm whether these records are intended for a future per-cluster LB.
resource "aws_route53_record" "api_failover_primary" {
zone_id = data.aws_route53_zone.main.zone_id
name = "api-failover.${var.domain_name}"
type = "A"
alias {
name = aws_lb.shared.dns_name
zone_id = aws_lb.shared.zone_id
evaluate_target_health = true
}
set_identifier = "primary"
health_check_id = aws_route53_health_check.blue.id
failover_routing_policy {
type = "PRIMARY"
}
}
resource "aws_route53_record" "api_failover_secondary" {
zone_id = data.aws_route53_zone.main.zone_id
name = "api-failover.${var.domain_name}"
type = "A"
alias {
name = aws_lb.shared.dns_name
zone_id = aws_lb.shared.zone_id
evaluate_target_health = true
}
set_identifier = "secondary"
health_check_id = aws_route53_health_check.green.id
failover_routing_policy {
type = "SECONDARY"
}
}
# CNAME variant for cases where an explicit TTL is required (alias records
# have no configurable TTL). Low TTL keeps DNS cut-overs fast, at the cost
# of more queries billed than an alias record.
resource "aws_route53_record" "api_cname" {
  zone_id = data.aws_route53_zone.main.zone_id
  name    = "api-v2.${var.domain_name}"
  type    = "CNAME"
  ttl     = var.dns_ttl
  records = [aws_lb.shared.dns_name]
}

variable "dns_ttl" {
  description = "DNS TTL in seconds"
  type        = number
  default     = 60 # low value for fast cut-over
}
# Database pod pinned to a single zone (Blue cluster: ap-northeast-2a).
# Fixes vs. original:
#  - The official postgres image refuses to start without POSTGRES_PASSWORD
#    (or POSTGRES_HOST_AUTH_METHOD) — supplied here from a Secret.
#  - PGDATA points at a subdirectory so initdb does not fail on the
#    lost+found directory of a freshly formatted volume.
#  - The original carried BOTH a zone nodeSelector AND a zone
#    topologySpreadConstraint (its comment said "or"); with a hard zone pin
#    the spread constraint is redundant, so it is dropped here.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: postgresql
  namespace: data
spec:
  serviceName: postgresql
  replicas: 1
  selector:
    matchLabels:
      app: postgresql
  template:
    metadata:
      labels:
        app: postgresql
    spec:
      # Zone pin (Blue cluster's zone).
      nodeSelector:
        topology.kubernetes.io/zone: ap-northeast-2a
      containers:
        - name: postgresql
          image: postgres:15
          env:
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: postgresql-credentials # NOTE(review): Secret must exist — confirm name
                  key: password
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          ports:
            - containerPort: 5432
          resources:
            requests:
              cpu: "2"
              memory: 4Gi
            limits:
              cpu: "4"
              memory: 8Gi
          volumeMounts:
            - name: data
              mountPath: /var/lib/postgresql/data
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        storageClassName: gp3-zone-a # zone-specific StorageClass
        resources:
          requests:
            storage: 100Gi
#!/bin/bash
# runbook/manual-failover.sh
# Manual traffic-shift runbook for the shared NLB (Blue/Green clusters).
#
# Strict mode: also fail on unset variables and pipeline errors
# (the original only set -e).
set -euo pipefail

# --- Fixed configuration ------------------------------------------------------
readonly PROFILE="eks-platform-prod"
readonly REGION="ap-northeast-2"
readonly LISTENER_ARN="arn:aws:elasticloadbalancing:ap-northeast-2:123456789012:listener/net/eks-platform-prod-nlb/abc123/def456"
readonly BLUE_TG_ARN="arn:aws:elasticloadbalancing:ap-northeast-2:123456789012:targetgroup/eks-platform-prod-blue-tg/abc123"
readonly GREEN_TG_ARN="arn:aws:elasticloadbalancing:ap-northeast-2:123456789012:targetgroup/eks-platform-prod-green-tg/def456"
# Print the runbook's command summary to stdout and exit non-zero.
usage() {
  cat <<EOF
Usage: $0 <action>

Actions:
 status - Show current traffic weights
 blue - 100% to Blue cluster
 green - 100% to Green cluster
 canary-green - 95% Blue, 5% Green (canary)
 split - 50% Blue, 50% Green
 custom B G - Custom weights (B + G must equal 100)
EOF
  exit 1
}
# Dump the weighted target-group configuration currently on the listener.
get_status() {
  local query='Listeners[0].DefaultActions[0].ForwardConfig.TargetGroups'
  echo "=== Current NLB Listener Configuration ==="
  aws elbv2 describe-listeners \
    --listener-arns "$LISTENER_ARN" \
    --profile "$PROFILE" \
    --region "$REGION" \
    --query "$query" \
    --output table
}
# Apply new listener weights: set_weights <blue 0-100> <green 0-100>.
# Validates both arguments BEFORE calling AWS — the original interpolated
# them straight into the JSON payload, so a typo produced a malformed
# forward config instead of a clear local error.
set_weights() {
  local blue_weight=$1
  local green_weight=$2
  local w

  for w in "$blue_weight" "$green_weight"; do
    case "$w" in
      ''|*[!0-9]*)
        echo "Error: weight '$w' is not a non-negative integer" >&2
        return 1
        ;;
    esac
    if [ "$w" -gt 100 ]; then
      echo "Error: weight '$w' exceeds 100" >&2
      return 1
    fi
  done

  echo "Setting weights: Blue=$blue_weight%, Green=$green_weight%"
  aws elbv2 modify-listener \
    --listener-arn "$LISTENER_ARN" \
    --default-actions "[{
      \"Type\": \"forward\",
      \"ForwardConfig\": {
        \"TargetGroups\": [
          {\"TargetGroupArn\": \"$BLUE_TG_ARN\", \"Weight\": $blue_weight},
          {\"TargetGroupArn\": \"$GREEN_TG_ARN\", \"Weight\": $green_weight}
        ]
      }
    }]" \
    --profile "$PROFILE" \
    --region "$REGION"
  echo "Done! Verifying..."
  get_status
}
# Print per-target health for both target groups (Blue first, then Green,
# separated by a blank line — same output shape as before).
check_health() {
  local label arn
  for label in Blue Green; do
    if [ "$label" = "Blue" ]; then
      arn=$BLUE_TG_ARN
    else
      arn=$GREEN_TG_ARN
    fi
    echo "=== $label Target Group Health ==="
    aws elbv2 describe-target-health \
      --target-group-arn "$arn" \
      --profile "$PROFILE" \
      --region "$REGION" \
      --query 'TargetHealthDescriptions[*].{Target:Target.Id,Port:Target.Port,Health:TargetHealth.State}' \
      --output table
    if [ "$label" = "Blue" ]; then
      echo ""
    fi
  done
}
# --- Entry point: dispatch on the requested action ----------------------------
case "${1:-}" in
  status)
    get_status
    check_health
    ;;
  blue)
    set_weights 100 0
    ;;
  green)
    set_weights 0 100
    ;;
  canary-green)
    set_weights 95 5
    ;;
  split)
    set_weights 50 50
    ;;
  custom)
    if [ -z "${2:-}" ] || [ -z "${3:-}" ]; then
      echo "Error: Custom requires two weight arguments"
      usage
    fi
    # Reject non-numeric weights BEFORE the arithmetic below — the original
    # died with a cryptic bash arithmetic error on e.g. "custom a b".
    case "$2$3" in
      *[!0-9]*)
        echo "Error: Weights must be non-negative integers"
        exit 1
        ;;
    esac
    if [ $(($2 + $3)) -ne 100 ]; then
      echo "Error: Weights must sum to 100"
      exit 1
    fi
    set_weights "$2" "$3"
    ;;
  *)
    usage
    ;;
esac
1. CloudWatch 알람 트리거
├─ UnHealthyHostCount > 0
└─ 2회 연속 (2분)
2. SNS 알림 전송
├─ Email 알림
└─ Lambda 트리거
3. Lambda 실행
├─ 장애 클러스터 식별
├─ NLB 가중치 변경 (0/100)
└─ 추가 알림 전송
4. 복구 후 수동 조치
├─ 장애 원인 분석
├─ 클러스터 복구
└─ 가중치 복원 (80/20)
## 장애 조치 체크리스트
### 사전 확인
- [ ] 양쪽 클러스터 헬스 상태 확인
- [ ] 현재 트래픽 가중치 확인
- [ ] 연결된 서비스 영향도 파악
### 실행
- [ ] 트래픽 전환 (runbook 스크립트 사용)
- [ ] 전환 후 헬스 체크 확인
- [ ] 애플리케이션 로그 모니터링
### 사후 확인
- [ ] 에러율 확인 (CloudWatch)
- [ ] 응답 시간 확인
- [ ] 사용자 피드백 수집
### 롤백 조건
- [ ] 전환 후 5분 내 에러율 > 5%
- [ ] 응답 시간 200% 증가
- [ ] 중요 기능 장애