Showing posts with label WWDC2020. Show all posts
Showing posts with label WWDC2020. Show all posts

Saturday, October 17, 2020

How to download WWDC2020 videos and subtitles

The scripts for previous WWDC are here https://iphonesdkdev.blogspot.com/2017/07/how-to-fetch-wwdc-2017-video-subtitle.html

Create and Run this script wwdc2020_fetch_srt.sh to fetch WWDC2020 subtitle


wwdc2020_fetch_en.srt.sh    Select all
#!/bin/bash # @Last Modified by: javacom # @Last Modified time: 2020-10-17 WWDC_YEAR=2020; # WWDC_SESSION_PREFIX=https://developer.apple.com/videos/play/wwdc$WWDC_YEAR; WWDC_LOCAL_DIR=$(basename $WWDC_SESSION_PREFIX); detect_video_m3u8 () { local session_url=$WWDC_SESSION_PREFIX/$SESSION_ID/; local session_html=$(curl -s $session_url); local video_url=$(echo "$session_html" | grep .m3u8 | grep $SESSION_ID | head -n1 | sed "s#.*\"\(https://.*m3u8\)\".*#\1#"); echo "$session_html" | grep .mp4 | grep $SESSION_ID | sed "s#.*\"\(https://.*mp4\).*\".*#\1#" | while read mp4_url; do local mp4_filename=$(basename $mp4_url | cut -d. -f1); local srt_filename=$mp4_filename.en.srt; # local srt_filename=$mp4_filename.zh.srt; echo "> Subtitle local: $WWDC_LOCAL_DIR/$srt_filename" >&2; > $WWDC_LOCAL_DIR/$srt_filename; done echo "$video_url"; echo "> Video: $video_url" >&2; } detect_subtitle_m3u8 () { local video_url=$1; # en subtitle local subtitle_uri=$(curl -s $video_url | grep "LANGUAGE=\"en\",URI" | sed "s#.*URI=\"\(.*\)\"#\1#"); # zh subtitle #local subtitle_uri=$(curl -s $video_url | grep "LANGUAGE=\"zh\"" | sed "s#.*URI=\"\(.*\)\"#\1#"); local subtitle_url=$subtitle_uri; [[ "$subtitle_uri" != http* ]] && { subtitle_url=$(dirname $video_url)/$subtitle_uri; } echo "$subtitle_url"; echo "> Subtitle: $subtitle_url" >&2; } download_subtitle_contents () { local subtitle_url=$1; echo "> Downloading... " local subtitle_base_url=$(dirname $subtitle_url); curl -s $subtitle_url | grep "webvtt" | while read webvtt; do local subtitle_webvtt=$subtitle_base_url/$webvtt; #echo "- get $subtitle_webvtt"; local subtitle_content=$(curl -s $subtitle_webvtt); # en subtitle ls $WWDC_LOCAL_DIR/"wwdc$WWDC_YEAR"_"$SESSION_ID"*.en.srt | while read srt_file; do # zh subtitle # ls $WWDC_LOCAL_DIR/"wwdc$WWDC_YEAR"_"$SESSION_ID"*.zh.srt | while read srt_file; do echo "$subtitle_content" >> $srt_file; done done } main () { [ ! -d $WWDC_LOCAL_DIR ] && { mkdir $WWDC_LOCAL_DIR; } #Year 2020 change {3\} to {3,5\} curl -s $WWDC_SESSION_PREFIX | grep /videos/play/wwdc$WWDC_YEAR | sed "s#.*/videos/play/wwdc$WWDC_YEAR/\([0-9]\{3,5\}\).*#\1#" | sort | uniq | while read SESSION_ID; do #echo "SESSION_ID is" $SESSION_ID local video_url=$(detect_video_m3u8 $SESSION_ID); local subtitle_url=$(detect_subtitle_m3u8 $video_url); download_subtitle_contents $subtitle_url; done } main;




Run this shell script to format as SRT subtitle

shellscript.sh    Select all
WWDC_YEAR=2020; # cd wwdc$WWDC_YEAR mkdir -p sd mkdir -p hd for i in *_sd.??.srt; do sed -e '/WEBVTT/d;/X-TIMESTAMP/d;s/align.middle line.*$//;' $i | awk '/^[0-9]{2}:[0-9]{2}:/ {seen[$0]++; skipduplicated=0} {if (seen[$0]>1) skipduplicated=1; if (!skipduplicated) print $0}' | awk -v RS="" '{gsub("\n", "-Z"); print}' | awk '$0 !~/^WEB/ {print $0}' | uniq | awk '{printf "\n%s-Z%s", NR,$0 }' | awk -v ORS="\n\n" '{gsub("-Z", "\n"); print}' | sed -e 's/.A:middle$//g;s/&gt;/>/g;s/&lt;/</g;1,2d;' > sd/$i; done for i in *_hd.??.srt; do sed -e '/WEBVTT/d;/X-TIMESTAMP/d;s/align.middle line.*$//;' $i | awk '/^[0-9]{2}:[0-9]{2}:/ {seen[$0]++; skipduplicated=0} {if (seen[$0]>1) skipduplicated=1; if (!skipduplicated) print $0}' | awk -v RS="" '{gsub("\n", "-Z"); print}' | awk '$0 !~/^WEB/ {print $0}' | uniq | awk '{printf "\n%s-Z%s", NR,$0 }' | awk -v ORS="\n\n" '{gsub("-Z", "\n"); print}' | sed -e 's/.A:middle$//g;s/&gt;/>/g;s/&lt;/</g;1,2d;' > hd/$i; done




Run this script wwdc2020_fetch_mp4.sh to download all mp4 (HD and SD) videos (also works for 2021)

wwdc2020_fetch_mp4.sh    Select all
#!/bin/bash # @Last Modified by: javacom # @Last Modified time: 2020-10-17 WWDC_YEAR=2020; # change to 2021 also works for WWDC2021 # WWDC_SESSION_PREFIX=https://developer.apple.com/videos/play/wwdc$WWDC_YEAR; WWDC_LOCAL_DIR=$(basename $WWDC_SESSION_PREFIX); download_mp4_video () { local session_url=$WWDC_SESSION_PREFIX/$SESSION_ID/; local session_html=$(curl -s $session_url); local video_url=$(echo "$session_html" | grep .m3u8 | grep $SESSION_ID | head -n1 | sed "s#.*\"\(https://.*m3u8\)\".*#\1#"); echo "$session_html" | grep .mp4 | grep $SESSION_ID | sed "s#.*\"\(https://.*mp4\).*\".*#\1#" | while read mp4_url; do local mp4_filename=$(basename $mp4_url); if [ -e $WWDC_LOCAL_DIR/$mp4_filename ] then echo "> MP4 already existed : $WWDC_LOCAL_DIR/$mp4_filename" >&2; echo "> To resume broken download use curl -C - --connect-timeout 1200 -o $WWDC_LOCAL_DIR/$mp4_filename $mp4_url" >&2; echo " " >&2; else echo "> MP4 Downloading... : $mp4_url" >&2; curl --connect-timeout 120 -o $WWDC_LOCAL_DIR/$mp4_filename $mp4_url fi done } main () { [ ! -d $WWDC_LOCAL_DIR ] && { mkdir $WWDC_LOCAL_DIR; } #Year 2020 change {3\} to {3,5\} curl -s $WWDC_SESSION_PREFIX | grep /videos/play/wwdc$WWDC_YEAR | sed "s#.*/videos/play/wwdc$WWDC_YEAR/\([0-9]\{3,5\}\).*#\1#" | sort | uniq | while read SESSION_ID; do download_mp4_video $SESSION_ID; done } main;






Run this script to rename the video or subtitles to proper title (HD & SD) videos

wwdc2020_rename_title.sh    Select all
#!/bin/sh # @Last Modified by: javacom # @Last Modified time: 2020-10-17 #sd video curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_sd.mp4" \] \&\& mv "wwdc2020_\1_sd.mp4" "wwdc2020_\1_sd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.mp4"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash #hd video curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_hd.mp4" \] \&\& mv "wwdc2020_\1_hd.mp4" "wwdc2020_\1_hd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.mp4"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash #sd video en.srt curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_sd.en.srt" \] \&\& mv "wwdc2020_\1_sd.en.srt" "wwdc2020_\1_sd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.en.srt"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash #hd video en.srt curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_hd.en.srt" \] \&\& mv "wwdc2020_\1_hd.en.srt" "wwdc2020_\1_hd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.en.srt"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash #sd video zh.srt curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_sd.zh.srt" \] \&\& mv "wwdc2020_\1_sd.zh.srt" "wwdc2020_\1_sd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.zh.srt"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash #hd video zh.srt curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_hd.zh.srt" \] \&\& mv "wwdc2020_\1_hd.zh.srt" "wwdc2020_\1_hd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.zh.srt"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash # For WWDC2021 WWDC_YEAR=2021; VIDEO=sd; curl -s https://developer.apple.com/videos/wwdc${WWDC_YEAR}/ | grep -B1 "video-title" | sed -e "s#[[:space:]]\{10,\}<a href=./videos/play/wwdc202./\([0-9]\{3,5\}\).*#\[ -f \"wwdc${WWDC_YEAR}-\1_${VIDEO}.mp4\" \] \&\& mv \"wwdc${WWDC_YEAR}-\1_${VIDEO}.mp4\" \"wwdc${WWDC_YEAR}-\1_sd_#" -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.mp4"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | sed "s#[\/@:’\'\,]#_#g" | /bin/bash






# Example nohup commnad for Linux Download

nohup bash -c 'cd $HOME/Downloads/WWDC/; ./wwdc2020_fetch_mp4.sh' &> nohup.wwdc2020.mp4.out &

nohup bash -c 'cd $HOME/Downloads/WWDC/; ./wwdc2020_fetch_en.srt.sh' &> nohup.wwdc2020.en.srt.out &

# or use screen utility to download
screen -S wwdcdownloadmp4
bash wwdc2020_fetch_mp4.sh



If want to remove special characters in filename "brew install rename" and use this command to rename
rename "s/[\@:’\'\,]//g" *.mp4 *.srt