Tuesday, July 4, 2017

How to fetch WWDC 2017/2018/2019 video subtitles and convert them to SRT format

Create and run the script wwdc_fetch_srt.sh to fetch the WWDC 2019 subtitles.
Reference: https://github.com/wsvn53/wwdc2016-subtitles

wwdc_fetch_srt.sh    Select all
#!/usr/bin/env bash
# @Author: Ethan
# @Date:   2016-06-22 14:10:53
# @Last Modified by:   javacom
# @Last Modified time: 2019-06-06
#
# Fetches the English WebVTT subtitle segments of every session listed on the
# WWDC page for WWDC_YEAR and appends them to one .srt file per MP4 variant
# inside ./wwdc<year>/. The files still contain raw WebVTT and need a separate
# formatting pass to become SRT.
#
# Bash is required: the script uses `local` and `[[ ... ]]`.

WWDC_YEAR=2019 # change to 2017/2018 and also works for WWDC2017 or WWDC2018
WWDC_SESSION_PREFIX="https://developer.apple.com/videos/play/wwdc${WWDC_YEAR}"
WWDC_LOCAL_DIR=$(basename "$WWDC_SESSION_PREFIX")

#######################################
# Finds the HLS playlist of a session and creates (truncates) one empty
# .srt file per MP4 link found on the session page.
# Globals:   WWDC_SESSION_PREFIX, WWDC_LOCAL_DIR (read); SESSION_ID (fallback)
# Arguments: $1 - session id (defaults to $SESSION_ID)
# Outputs:   playlist URL on stdout (empty when none found); progress on stderr
# Returns:   1 when no session id is available, 0 otherwise
#######################################
detect_video_m3u8() {
  local session_id=${1:-${SESSION_ID:-}}
  local session_url session_html video_url mp4_url mp4_filename srt_path

  [[ -n "$session_id" ]] || return 1
  session_url="${WWDC_SESSION_PREFIX}/${session_id}/"
  session_html=$(curl -s "$session_url")

  # `sed -n ...p` prints only when the URL pattern matched, so an unexpected
  # HTML line can never be mistaken for a URL.
  video_url=$(printf '%s\n' "$session_html" \
    | grep -F '.m3u8' \
    | grep -- "$session_id" \
    | head -n1 \
    | sed -n 's#.*"\(https://.*m3u8\)".*#\1#p')

  printf '%s\n' "$session_html" \
    | grep -F '.mp4' \
    | grep -- "$session_id" \
    | sed -n 's#.*"\(https://.*mp4\).*".*#\1#p' \
    | while read -r mp4_url; do
      mp4_filename=$(basename "$mp4_url" | cut -d. -f1)
      srt_path="${WWDC_LOCAL_DIR}/${mp4_filename}.srt"
      echo "> Subtitle local: $srt_path" >&2
      : > "$srt_path" # start from an empty file on every run
    done

  printf '%s\n' "$video_url"
  echo "> Video: $video_url" >&2
}

#######################################
# Resolves the English subtitle playlist referenced by a video playlist.
# Arguments: $1 - video playlist (.m3u8) URL
# Outputs:   subtitle playlist URL on stdout; progress on stderr
# Returns:   1 when the video URL is empty or no English track is listed
#######################################
detect_subtitle_m3u8() {
  local video_url=${1:-}
  local subtitle_uri subtitle_url

  if [[ -z "$video_url" ]]; then
    echo "> No video playlist, skipping subtitle lookup" >&2
    return 1
  fi

  # Capture only up to the closing quote of URI="..." so attributes that
  # follow it on the same line are not swallowed.
  subtitle_uri=$(curl -s "$video_url" \
    | grep 'LANGUAGE="eng"' \
    | head -n1 \
    | sed -n 's#.*URI="\([^"]*\)".*#\1#p')

  if [[ -z "$subtitle_uri" ]]; then
    echo "> No English subtitle found in $video_url" >&2
    return 1
  fi

  # Relative URIs are resolved against the directory of the video playlist.
  if [[ "$subtitle_uri" == http* ]]; then
    subtitle_url=$subtitle_uri
  else
    subtitle_url="$(dirname "$video_url")/${subtitle_uri}"
  fi

  printf '%s\n' "$subtitle_url"
  echo "> Subtitle: $subtitle_url" >&2
}

#######################################
# Downloads every WebVTT segment of a subtitle playlist and appends it to all
# .srt files of the session.
# Globals:   WWDC_LOCAL_DIR (read); SESSION_ID (fallback)
# Arguments: $1 - subtitle playlist URL
#            $2 - session id (optional, defaults to $SESSION_ID)
# Returns:   1 when the subtitle URL or session id is empty
#######################################
download_subtitle_contents() {
  local subtitle_url=${1:-}
  local session_id=${2:-${SESSION_ID:-}}
  local subtitle_base_url webvtt subtitle_content srt_file

  [[ -n "$subtitle_url" && -n "$session_id" ]] || return 1

  echo "> Downloading... "
  subtitle_base_url=$(dirname "$subtitle_url")

  curl -s "$subtitle_url" \
    | grep "webvtt" \
    | while read -r webvtt; do
      subtitle_content=$(curl -s "${subtitle_base_url}/${webvtt}")
      # Restrict the glob to *.srt: MP4 files of the same session may live in
      # this directory too and must not get subtitle text appended.
      for srt_file in "${WWDC_LOCAL_DIR}/${session_id}"_*.srt; do
        [[ -e "$srt_file" ]] || continue
        printf '%s\n' "$subtitle_content" >> "$srt_file"
      done
    done
}

#######################################
# Walks all three-digit session ids linked from the WWDC page and fetches
# their subtitles; sessions without playlist or English track are skipped.
#######################################
main() {
  local video_url subtitle_url

  mkdir -p -- "$WWDC_LOCAL_DIR" || return 1

  curl -s "$WWDC_SESSION_PREFIX" \
    | grep "/videos/play/wwdc${WWDC_YEAR}" \
    | sed -n "s#.*/videos/play/wwdc${WWDC_YEAR}/\([0-9]\{3\}\).*#\1#p" \
    | sort -u \
    | while read -r SESSION_ID; do
      video_url=$(detect_video_m3u8 "$SESSION_ID") || continue
      subtitle_url=$(detect_subtitle_m3u8 "$video_url") || continue
      download_subtitle_contents "$subtitle_url" "$SESSION_ID"
    done
}

main "$@"




Run this shell script to convert the fetched subtitles to SRT format

shellscript.sh    Select all
#!/usr/bin/env bash
# Converts the raw WebVTT dumps (???_sd_*.srt / ???_hd_*.srt) found in
# ./wwdc<year>/ into numbered subtitle cues, written to ./wwdc<year>/sd/ and
# ./wwdc<year>/hd/ under the same file names.

WWDC_YEAR=2019 # change to 2017/2018 and also works for WWDC2017 or WWDC2018

#######################################
# Formats one concatenated WebVTT dump as numbered cues.
# Arguments: $1 - path of the raw subtitle file
# Outputs:   for every cue: index line, cue lines, one empty line (stdout)
#######################################
format_srt() {
  local src=$1

  # Stage 1: drop the per-segment WebVTT header lines.
  # Stage 2: segments overlap, so a timing line that was already seen marks a
  #          repeated cue; skip everything up to the next timing line.
  #          ([0-9][0-9] instead of [0-9]{2}: not every awk has intervals.)
  # Stage 3: paragraph mode (RS="") treats each blank-line separated cue as
  #          one record; drop leftovers starting with WEB, collapse adjacent
  #          identical cues and number the rest. Working on whole records
  #          avoids a join marker that could collide with cue text.
  # Stage 4: strip the " A:middle" cue setting and decode the HTML entities
  #          (&amp; last, so "&amp;lt;" is not decoded twice).
  sed -e '/WEBVTT/d' -e '/X-TIMESTAMP/d' < "$src" \
    | awk '
        /^[0-9][0-9]:[0-9][0-9]:/ { skip = (seen[$0]++ > 0) }
        !skip
      ' \
    | awk -v RS="" '
        /^WEB/ { next }
        ($0 "") == last { next }
        { last = $0 ""; n++; print n; print $0; print "" }
      ' \
    | sed -e 's/.A:middle$//' -e 's/&gt;/>/g' -e 's/&lt;/</g' -e 's/&amp;/\&/g'
}

#######################################
# Formats every sd and hd subtitle file of the configured year.
# Globals: WWDC_YEAR (read)
# Returns: 1 when the wwdc<year> directory cannot be entered
#######################################
main() {
  local quality src

  cd "wwdc${WWDC_YEAR}" 2> /dev/null || {
    echo "cannot enter directory wwdc${WWDC_YEAR}" >&2
    return 1
  }

  for quality in sd hd; do
    mkdir -p "$quality"
    for src in ???_"${quality}"_*.srt; do
      [[ -e "$src" ]] || continue # glob matched nothing
      format_srt "$src" > "${quality}/${src}"
    done
  done
}

main "$@"




Run the script wwdc_fetch_mp4.sh to download all MP4 (HD and SD) videos

wwdc_fetch_mp4.sh    Select all
#!/usr/bin/env bash
# @Last Modified by:   javacom
# @Last Modified time: 2019-06-06
#
# Downloads every MP4 (HD and SD) linked from the session pages of the WWDC
# year configured below into ./wwdc<year>/. Files that already exist are
# skipped and a resume hint is printed instead.
#
# Bash is required: the script uses `local` and `[[ ... ]]`.

WWDC_YEAR=2019 # change to 2017/2018 and also works for WWDC2017 or WWDC2018
WWDC_SESSION_PREFIX="https://developer.apple.com/videos/play/wwdc${WWDC_YEAR}"
WWDC_LOCAL_DIR=$(basename "$WWDC_SESSION_PREFIX")

#######################################
# Downloads all MP4 files linked on one session page.
# Globals:   WWDC_SESSION_PREFIX, WWDC_LOCAL_DIR (read); SESSION_ID (fallback)
# Arguments: $1 - session id (defaults to $SESSION_ID)
# Outputs:   progress messages on stderr
# Returns:   1 when no session id is available
#######################################
download_mp4_video() {
  local session_id=${1:-${SESSION_ID:-}}
  local session_url session_html mp4_url mp4_filename target

  [[ -n "$session_id" ]] || return 1
  session_url="${WWDC_SESSION_PREFIX}/${session_id}/"
  session_html=$(curl -s "$session_url")

  # `sed -n ...p` emits a line only when an https://...mp4 URL was extracted.
  printf '%s\n' "$session_html" \
    | grep -F '.mp4' \
    | grep -- "$session_id" \
    | sed -n 's#.*"\(https://.*mp4\).*".*#\1#p' \
    | while read -r mp4_url; do
      mp4_filename=$(basename "$mp4_url")
      target="${WWDC_LOCAL_DIR}/${mp4_filename}"
      if [[ -e "$target" ]]; then
        echo "> MP4 already existed : $target" >&2
        echo "> To resume broken download use curl -C - --connect-timeout 1200 -o $target $mp4_url" >&2
        echo " " >&2
      else
        echo "> MP4 Downloading... : $mp4_url" >&2
        # --fail: do not store an HTTP error page under the .mp4 name, which
        # later runs would treat as an already downloaded video.
        curl --fail --connect-timeout 120 -o "$target" "$mp4_url"
      fi
    done
}

#######################################
# Walks all three-digit session ids linked from the WWDC page and downloads
# their videos.
#######################################
main() {
  mkdir -p -- "$WWDC_LOCAL_DIR" || return 1

  curl -s "$WWDC_SESSION_PREFIX" \
    | grep "/videos/play/wwdc${WWDC_YEAR}" \
    | sed -n "s#.*/videos/play/wwdc${WWDC_YEAR}/\([0-9]\{3\}\).*#\1#p" \
    | sort -u \
    | while read -r SESSION_ID; do
      download_mp4_video "$SESSION_ID"
    done
}

main "$@"


One-liner version wwdc2019_fetch_mp4.sh to download all MP4 videos

wwdc2019_fetch_mp4.sh    Select all
# one liner for hd videos download
# change to 2017/2018 and also works for WWDC2017 or WWDC2018
# Each command below is a single line: it lists the session links of the WWDC
# page, takes the first _hd_ (or _sd_) MP4 URL of every session page and
# downloads it into the current directory unless the file already exists.
# The URL pattern stops at a double quote so one match is exactly one URL.
WWDCYEAR="wwdc2019"; for i in $(curl -s "https://developer.apple.com/videos/$WWDCYEAR/" | grep -o '<a href="/videos/play/'"$WWDCYEAR"'/[0-9]*' | cut -d '"' -f2 | sort -u); do video_url=$(curl -s "https://developer.apple.com${i}" | grep -o 'http[^"]*_hd_[^"]*\.mp4' | head -n1); if [ -n "$video_url" ]; then mp4_filename=$(basename "$video_url"); if [ -e "$mp4_filename" ]; then echo "skipping $mp4_filename"; else echo "Downloading ... $mp4_filename"; curl --connect-timeout 120 -O "$video_url"; fi; fi; done

# one liner for sd videos download
WWDCYEAR="wwdc2019"; for i in $(curl -s "https://developer.apple.com/videos/$WWDCYEAR/" | grep -o '<a href="/videos/play/'"$WWDCYEAR"'/[0-9]*' | cut -d '"' -f2 | sort -u); do video_url=$(curl -s "https://developer.apple.com${i}" | grep -o 'http[^"]*_sd_[^"]*\.mp4' | head -n1); if [ -n "$video_url" ]; then mp4_filename=$(basename "$video_url"); if [ -e "$mp4_filename" ]; then echo "skipping $mp4_filename"; else echo "Downloading ... $mp4_filename"; curl --connect-timeout 120 -O "$video_url"; fi; fi; done




No comments: