AthenaでCloudFrontのログを検索してみた


機会があって知ったのでとりあえずやってみたやつを備忘録しておきます。

CREATE EXTERNAL TABLE IF NOT EXISTS cf_log 
( request_date STRING, request_time STRING, x_edge_location STRING, 
sc_bytes INT, client_ip STRING, cs_method STRING, cs_host STRING,
 cs_uri_stem STRING, sc_status STRING, cs_referer STRING, user_agent STRING,
 uri_query STRING, cookie STRING, x_edge_result_type STRING, x_edge_request_id 
STRING, x_host_header STRING, cs_protocol STRING, cs_bytes INT, time_taken DECIMAL(8,3),
 x_forwarded_for STRING, ssl_protocol STRING, ssl_cipher STRING, x_edge_response_result_type STRING,
 cs_protocol_version STRING ) ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
 WITH SERDEPROPERTIES ( 'serialization.format' = '\t', 'input.regex' = '\t' )
 LOCATION 's3://<my-cflogs-backet-name>/cf-logs-data/'
 TBLPROPERTIES ('has_encrypted_data'='false');

IPを降順で出す

SELECT client_ip, count(*) as ct FROM sampledb."cf_log" group by client_ip order by ct desc limit 10;

ユーザエージェントを降順で出す

SELECT user_agent, count(*) as ct FROM sampledb."cf_log" group by user_agent order by ct desc limit 10;

ボットっぽいアクセスが多かった特定のユーザエージェントを含むデータのサブセットを作ってそこからIPをカウントして出す

with
dataset as (
  SELECT client_ip FROM sampledb."cf_log" 
where user_agent like 'Mozilla/5.0 (compatible; MJ12bot%'
)
select client_ip,count(*) as ct from dataset group by client_ip order by ct desc limit 10;

最近はCloudTrailのログとかもポチっと押すとテーブル作ってくれて証跡がクエリで終えたりするようですね。
(CloudTrailの画面にリンクがでている)
https://docs.aws.amazon.com/ja_jp/athena/latest/ug/cloudtrail-logs.html
https://dev.classmethod.jp/cloud/aws/new-feature-amazon-athena-create-aws-cloudtrail-table/

いちおう、cloudtrailのログテーブルこんな感じでつくるっぽい

CREATE EXTERNAL TABLE Cloudtrail_Logs (
eventversion STRING,
userIdentity STRUCT<
type:STRING,
principalid:STRING,
arn:STRING,
accountid:STRING,
invokedby:STRING,
accesskeyid:STRING,
userName:String,
sessioncontext:STRUCT<
attributes:STRUCT<
mfaauthenticated:STRING,
creationdate:STRING>,
sessionIssuer:STRUCT<
type:STRING,
principalId:STRING,
arn:STRING,
accountId:STRING,
userName:STRING>>>,
eventTime STRING,
eventSource STRING,
eventName STRING,
awsRegion STRING,
sourceIpAddress STRING,
userAgent STRING,
errorCode STRING,
errorMessage STRING,
requestId STRING,
eventId STRING,
resources ARRAY<STRUCT<
ARN:STRING,
accountId:STRING,
type:STRING>>,
eventType STRING,
apiVersion STRING,
readOnly BOOLEAN,
recipientAccountId STRING,
sharedEventID STRING,
vpcEndpointId STRING
)
ROW FORMAT SERDE 'com.amazon.emr.hive.serde.CloudTrailSerde'
STORED AS INPUTFORMAT 'com.amazon.emr.cloudtrail.CloudTrailInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION 's3://<my-bucket>/AWSLogs/AWS_ACCOUNT_NUMBER/CloudTrail/<my-region>/';

とくていのIPからエラー吐いてるっぽいの探すくえり

select eventname, sourceipaddress, userIdentity.arn, eventsource, errorcode, errormessage
from sampledb.cloudtrail_logs
where
sourceipaddress in ('xxx.xxx.xxx.147', 'xxx.xxx.xxx.25')
select eventname, sourceipaddress, userIdentity.arn, eventsource, errorcode, errormessage
from sampledb.cloudtrail_logs
where errorCode <> '' and
sourceipaddress in ('xxx.xxx.xxx.147', 'xxx.xxx.xxx.25')

CloudwatchでTrailのイベント通知さすやつ
https://docs.aws.amazon.com/ja_jp/AmazonCloudWatch/latest/events/Create-CloudWatch-Events-CloudTrail-Rule.html
(Lambdaに渡してコード書いて環境内にいるやつならフォレンジック環境になるSGあてる処理をとかなんとかする的な。。難しいですね。)

S3バケット自体のイベント通知をなんとかするやつ
https://docs.aws.amazon.com/ja_jp/AmazonS3/latest/user-guide/enable-event-notifications.html