Create and run this script, wwdc2020_fetch_en.srt.sh, to fetch the WWDC 2020 subtitles
- wwdc2020_fetch_en.srt.sh Select all
#!/bin/bash
# @Last Modified by: javacom
# @Last Modified time: 2020-10-17
# WWDC edition (year) whose session pages are scraped.
WWDC_YEAR=2020
# Base URL under which the session pages of that year are requested.
WWDC_SESSION_PREFIX="https://developer.apple.com/videos/play/wwdc${WWDC_YEAR}"
# Local output directory: the last path component of the base URL.
WWDC_LOCAL_DIR="${WWDC_SESSION_PREFIX##*/}"
#######################################
# Fetch one session page, report its video playlist and prepare the
# local subtitle files.
# Globals:
#   WWDC_SESSION_PREFIX (read) - base URL of the session pages
#   WWDC_LOCAL_DIR      (read) - directory receiving the .en.srt files
#   SESSION_ID          (read) - used only when no argument is given
# Arguments:
#   $1 - session id (optional; falls back to $SESSION_ID)
# Outputs:
#   stdout: URL of the first .m3u8 link that mentions the session id
#           (an empty line when the page has none)
#   stderr: progress messages
# Side effects:
#   creates or truncates "<mp4 name without extension>.en.srt" in
#   $WWDC_LOCAL_DIR for every .mp4 link that mentions the session id
# Returns:
#   1 when no session id is available, 0 otherwise
#######################################
detect_video_m3u8 () {
  local session_id=${1:-${SESSION_ID:-}}
  if [[ -z "$session_id" ]]; then
    echo "> Video: no session id given" >&2
    return 1
  fi

  local session_url="$WWDC_SESSION_PREFIX/$session_id/"
  local session_html video_url
  session_html=$(curl -s "$session_url")

  # -F: the dot in ".m3u8" / ".mp4" is meant literally, not as a regex wildcard.
  video_url=$(printf '%s\n' "$session_html" \
    | grep -F -- '.m3u8' \
    | grep -F -- "$session_id" \
    | head -n1 \
    | sed "s#.*\"\(https://.*m3u8\)\".*#\1#")

  printf '%s\n' "$session_html" \
    | grep -F -- '.mp4' \
    | grep -F -- "$session_id" \
    | sed "s#.*\"\(https://.*mp4\).*\".*#\1#" \
    | while IFS= read -r mp4_url; do
        local mp4_filename srt_filename
        # Keep everything before the first dot, e.g. wwdc2020_10001_hd.
        mp4_filename=$(basename "$mp4_url" | cut -d. -f1)
        srt_filename=$mp4_filename.en.srt
        # srt_filename=$mp4_filename.zh.srt
        echo "> Subtitle local: $WWDC_LOCAL_DIR/$srt_filename" >&2
        # Start from an empty file; subtitle segments are appended later.
        : > "$WWDC_LOCAL_DIR/$srt_filename"
      done

  echo "$video_url"
  echo "> Video: $video_url" >&2
}
#######################################
# Find the English subtitle playlist referenced by a video playlist.
# Arguments:
#   $1 - URL of the video (master) .m3u8 playlist
# Outputs:
#   stdout: URL of the subtitle playlist; a URI that does not start with
#           "http" is resolved against the directory of $1
#   stderr: progress / diagnostic messages
# Returns:
#   0 on success; 1 (with nothing on stdout) when $1 is empty or the
#   playlist has no line containing LANGUAGE="en",URI
#######################################
detect_subtitle_m3u8 () {
  local video_url=$1
  local subtitle_uri subtitle_url

  if [[ -z "$video_url" ]]; then
    echo "> Subtitle: no video playlist URL given" >&2
    return 1
  fi

  # en subtitle
  # - tr drops carriage returns so CRLF playlists do not leak \r into the URL
  # - head keeps the first matching track only
  # - [^"]* stops at the closing quote even if more attributes follow URI
  subtitle_uri=$(curl -s "$video_url" \
    | tr -d '\r' \
    | grep "LANGUAGE=\"en\",URI" \
    | head -n1 \
    | sed "s#.*URI=\"\([^\"]*\)\".*#\1#")
  # zh subtitle: match on "LANGUAGE=\"zh\"" instead of "LANGUAGE=\"en\",URI"

  if [[ -z "$subtitle_uri" ]]; then
    echo "> Subtitle: no English subtitle track in $video_url" >&2
    return 1
  fi

  subtitle_url=$subtitle_uri
  if [[ "$subtitle_uri" != http* ]]; then
    subtitle_url=$(dirname "$video_url")/$subtitle_uri
  fi

  echo "$subtitle_url"
  echo "> Subtitle: $subtitle_url" >&2
}
#######################################
# Download every WebVTT segment listed in a subtitle playlist and append
# it to all local English subtitle files of the current session.
# Globals:
#   WWDC_LOCAL_DIR, WWDC_YEAR, SESSION_ID (read) - select the target files
#   "$WWDC_LOCAL_DIR/wwdc${WWDC_YEAR}_${SESSION_ID}*.en.srt"
# Arguments:
#   $1 - URL of the subtitle .m3u8 playlist
# Outputs:
#   stdout: "> Downloading... "; stderr: diagnostics
# Returns:
#   1 when $1 is empty or no target file exists; otherwise the status
#   of the download pipeline
#######################################
download_subtitle_contents () {
  local subtitle_url=$1
  echo "> Downloading... "
  if [[ -z "$subtitle_url" ]]; then
    echo "> No subtitle playlist URL; nothing to download" >&2
    return 1
  fi

  local subtitle_base_url
  subtitle_base_url=$(dirname "$subtitle_url")

  # Resolve the target files once with a glob instead of parsing `ls`
  # for every segment.
  # en subtitle (use *.zh.srt here for zh subtitle)
  local -a srt_files=()
  local candidate
  for candidate in "$WWDC_LOCAL_DIR"/"wwdc${WWDC_YEAR}_${SESSION_ID}"*.en.srt; do
    # An unmatched glob stays literal; keep only paths that really exist.
    if [[ -e "$candidate" ]]; then
      srt_files+=("$candidate")
    fi
  done
  if (( ${#srt_files[@]} == 0 )); then
    echo "> No local .en.srt file for session $SESSION_ID in $WWDC_LOCAL_DIR" >&2
    return 1
  fi

  # Segment lines are the playlist lines containing "webvtt"; they are
  # relative to the playlist's directory. tr removes CRs from CRLF playlists.
  curl -s "$subtitle_url" \
    | tr -d '\r' \
    | grep "webvtt" \
    | while IFS= read -r webvtt; do
        local subtitle_content srt_file
        subtitle_content=$(curl -s "$subtitle_base_url/$webvtt")
        for srt_file in "${srt_files[@]}"; do
          printf '%s\n' "$subtitle_content" >> "$srt_file"
        done
      done
}
#######################################
# Download the English subtitles of every session linked from the
# WWDC session index page.
# Globals:
#   WWDC_SESSION_PREFIX, WWDC_YEAR, WWDC_LOCAL_DIR (read)
#   SESSION_ID (written per session; the helper functions read it)
# Returns:
#   1 when the output directory cannot be created
#######################################
main () {
  mkdir -p "$WWDC_LOCAL_DIR" || return 1

  # grep -o emits every session link, also several per HTML line, and never
  # lets a line without a numeric id through; sed keeps only the id.
  #Year 2020 change {3} to {3,5}
  curl -s "$WWDC_SESSION_PREFIX" \
    | grep -oE "/videos/play/wwdc${WWDC_YEAR}/[0-9]{3,5}" \
    | sed 's#.*/##' \
    | sort -u \
    | while IFS= read -r SESSION_ID; do
        #echo "SESSION_ID is" $SESSION_ID
        local video_url subtitle_url
        video_url=$(detect_video_m3u8 "$SESSION_ID")
        # A session without a video playlist has no subtitles to fetch.
        [[ -n "$video_url" ]] || continue
        subtitle_url=$(detect_subtitle_m3u8 "$video_url")
        [[ -n "$subtitle_url" ]] || continue
        download_subtitle_contents "$subtitle_url"
      done
}
# Script entry point: run the subtitle download defined in main above.
main;
Run this shell script to format the downloaded subtitle files as SRT subtitles
- shellscript.sh Select all
# Convert the downloaded subtitle files (concatenated WebVTT segments) in
# ./wwdc$WWDC_YEAR into numbered SRT-style cues, written to the sd/ and hd/
# sub-directories under the same file names.
WWDC_YEAR=2020
#

# webvtt_to_srt FILE
# Reads FILE and writes the converted cues to stdout:
#   1. drop WEBVTT / X-TIMESTAMP header lines and the "align:middle line:..."
#      cue settings
#   2. skip a cue whose timing line was already seen
#   3. join each blank-line separated cue into one line, using "-Z" as a
#      temporary line-break marker (text containing "-Z" is split there)
#   4. number the cues, restore the line breaks, one blank line after each cue
#   5. strip a trailing " A:middle" setting and decode &gt; &lt; &amp;
#      (&amp; last, so "&amp;lt;" becomes the text "&lt;")
# The timing pattern spells out [0-9][0-9] instead of [0-9]{2} because some
# awk implementations do not support interval expressions.
webvtt_to_srt () {
  sed -e '/WEBVTT/d;/X-TIMESTAMP/d;s/align.middle line.*$//;' < "$1" \
    | awk '/^[0-9][0-9]:[0-9][0-9]:/ {seen[$0]++; skipduplicated=0} {if (seen[$0]>1) skipduplicated=1; if (!skipduplicated) print $0}' \
    | awk -v RS="" '{gsub("\n", "-Z"); print}' \
    | awk '$0 !~/^WEB/ {print $0}' \
    | uniq \
    | awk '{printf "\n%s-Z%s", NR,$0 }' \
    | awk -v ORS="\n\n" '{gsub("-Z", "\n"); print}' \
    | sed -e 's/.A:middle$//g;s/&gt;/>/g;s/&lt;/</g;s/&amp;/\&/g;1,2d;'
}

# format_wwdc_subtitles
# Changes into wwdc$WWDC_YEAR and converts every *_sd.??.srt into sd/ and
# every *_hd.??.srt into hd/. Returns 1 without touching anything when the
# directory cannot be entered or the output directories cannot be created.
format_wwdc_subtitles () {
  cd "wwdc${WWDC_YEAR}" || return 1
  mkdir -p sd hd || return 1
  for quality in sd hd; do
    for i in *_"${quality}".??.srt; do
      # An unmatched glob stays literal; do not create a junk output file.
      [ -e "$i" ] || continue
      webvtt_to_srt "$i" > "${quality}/$i"
    done
  done
}

format_wwdc_subtitles
Run this script wwdc2020_fetch_mp4.sh to download all mp4 (HD and SD) videos (also works for 2021)
- wwdc2020_fetch_mp4.sh  Select all
#!/bin/bash
# @Last Modified by: javacom
# @Last Modified time: 2020-10-17
# WWDC edition (year) whose videos are downloaded.
WWDC_YEAR=2020 # change to 2021 also works for WWDC2021
# Base URL under which the session pages of that year are requested.
WWDC_SESSION_PREFIX="https://developer.apple.com/videos/play/wwdc${WWDC_YEAR}"
# Local download directory: the last path component of the base URL.
WWDC_LOCAL_DIR="${WWDC_SESSION_PREFIX##*/}"
#######################################
# Download every .mp4 linked from one session page into $WWDC_LOCAL_DIR.
# Globals:
#   WWDC_SESSION_PREFIX (read) - base URL of the session pages
#   WWDC_LOCAL_DIR      (read) - download directory
#   SESSION_ID          (read) - used only when no argument is given
# Arguments:
#   $1 - session id (optional; falls back to $SESSION_ID)
# Outputs:
#   stderr: progress messages; curl's own progress output
# Notes:
#   A file that already exists is never downloaded again; the resume
#   command to run by hand is printed instead.
# Returns:
#   1 when no session id is available
#######################################
download_mp4_video () {
  local session_id=${1:-${SESSION_ID:-}}
  if [[ -z "$session_id" ]]; then
    echo "> MP4: no session id given" >&2
    return 1
  fi

  local session_url="$WWDC_SESSION_PREFIX/$session_id/"
  local session_html
  session_html=$(curl -s "$session_url")

  # -F: the dot in ".mp4" is meant literally, not as a regex wildcard.
  printf '%s\n' "$session_html" \
    | grep -F -- '.mp4' \
    | grep -F -- "$session_id" \
    | sed "s#.*\"\(https://.*mp4\).*\".*#\1#" \
    | while IFS= read -r mp4_url; do
        local mp4_filename mp4_path
        mp4_filename=$(basename "$mp4_url")
        mp4_path=$WWDC_LOCAL_DIR/$mp4_filename
        if [[ -e "$mp4_path" ]]; then
          echo "> MP4 already existed : $mp4_path" >&2
          echo "> To resume broken download use curl -C - --connect-timeout 1200 -o $mp4_path $mp4_url" >&2
          echo " " >&2
        else
          echo "> MP4 Downloading... : $mp4_url" >&2
          # --fail: treat an HTTP error as a failure instead of saving the
          # server's error page under the .mp4 name.
          curl --fail --connect-timeout 120 -o "$mp4_path" "$mp4_url" \
            || echo "> MP4 download failed : $mp4_url" >&2
        fi
      done
}
#######################################
# Download the videos of every session linked from the WWDC session
# index page.
# Globals:
#   WWDC_SESSION_PREFIX, WWDC_YEAR, WWDC_LOCAL_DIR (read)
#   SESSION_ID (written per session; download_mp4_video may read it)
# Returns:
#   1 when the download directory cannot be created
#######################################
main () {
  mkdir -p "$WWDC_LOCAL_DIR" || return 1

  # grep -o emits every session link, also several per HTML line, and never
  # lets a line without a numeric id through; sed keeps only the id.
  #Year 2020 change {3} to {3,5}
  curl -s "$WWDC_SESSION_PREFIX" \
    | grep -oE "/videos/play/wwdc${WWDC_YEAR}/[0-9]{3,5}" \
    | sed 's#.*/##' \
    | sort -u \
    | while IFS= read -r SESSION_ID; do
        download_mp4_video "$SESSION_ID"
      done
}
# Script entry point: run the video download defined in main above.
main;
Run this script to rename the videos and subtitles (HD and SD) so that their file names include the proper session titles
- wwdc2020_rename_title.sh  Select all
#!/bin/sh
# @Last Modified by: javacom
# @Last Modified time: 2020-10-17
# Each command below works the same way:
#   1. download the WWDC 2020 video index page;
#   2. keep every "video-title" line plus the line before it (grep -B1);
#   3. rewrite the session-link line into the start of a shell command
#        [ -f "wwdc2020_<id>_<q>.<ext>" ] && mv "wwdc2020_<id>_<q>.<ext>" "wwdc2020_<id>_<q>_
#      and the title line into  <title>.<ext>"  ; characters such as
#      @ : ’ ' , ? are removed, and the "--" group separators printed by
#      grep are dropped;
#   4. join each command start with the title line that follows it;
#   5. pipe the generated commands into /bin/bash, which renames the
#      matching files found in the current directory.
# NOTE(review): the titles come from the downloaded page and are executed
# as part of a shell command; only the characters listed above are removed.
#
# SD video (.mp4)
curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_sd.mp4" \] \&\& mv "wwdc2020_\1_sd.mp4" "wwdc2020_\1_sd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.mp4"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash
# HD video (.mp4)
curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_hd.mp4" \] \&\& mv "wwdc2020_\1_hd.mp4" "wwdc2020_\1_hd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.mp4"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash
# SD English subtitles (.en.srt)
curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_sd.en.srt" \] \&\& mv "wwdc2020_\1_sd.en.srt" "wwdc2020_\1_sd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.en.srt"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash
# HD English subtitles (.en.srt)
curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_hd.en.srt" \] \&\& mv "wwdc2020_\1_hd.en.srt" "wwdc2020_\1_hd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.en.srt"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash
# SD Chinese subtitles (.zh.srt)
curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_sd.zh.srt" \] \&\& mv "wwdc2020_\1_sd.zh.srt" "wwdc2020_\1_sd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.zh.srt"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash
# HD Chinese subtitles (.zh.srt)
curl -s https://developer.apple.com/videos/wwdc2020/ | grep -B1 "video-title" | sed -e 's#[[:space:]]\{10,\}<a href="/videos/play/wwdc2020/\([0-9]\{3,5\}\).*#\[ -f "wwdc2020_\1_hd.zh.srt" \] \&\& mv "wwdc2020_\1_hd.zh.srt" "wwdc2020_\1_hd_#' -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.zh.srt"#' -e '/--/d' | sed '/^\[ -f/{N;s/\n//;}' | /bin/bash
# For WWDC2021
# Renames "wwdc2021-<id>_<VIDEO>.mp4" in the current directory to
# "wwdc2021-<id>_<VIDEO>_<title>.mp4": the index page is turned into
# "[ -f old ] && mv old new" commands that are piped into /bin/bash.
# Set VIDEO to sd or hd. The new name uses ${VIDEO} too, so an hd file is
# no longer renamed to an "_sd_" name.
# The last sed replaces / @ : ’ ' , in the generated command with "_".
# NOTE(review): the titles come from the downloaded page and are executed
# as part of a shell command.
WWDC_YEAR=2021; VIDEO=sd
curl -s https://developer.apple.com/videos/wwdc${WWDC_YEAR}/ |
  grep -B1 "video-title" |
  sed -e "s#[[:space:]]\{10,\}<a href=./videos/play/wwdc202./\([0-9]\{3,5\}\).*#\[ -f \"wwdc${WWDC_YEAR}-\1_${VIDEO}.mp4\" \] \&\& mv \"wwdc${WWDC_YEAR}-\1_${VIDEO}.mp4\" \"wwdc${WWDC_YEAR}-\1_${VIDEO}_#" -e "s/[\@:’\'\,?]//g" -e "s/\"\(The.*\)\"/\1/" -e 's#.*video-title..\(.*\)\(</h4>\)#\1.mp4"#' -e '/--/d' |
  sed '/^\[ -f/{N;s/\n//;}' |
  sed "s#[\/@:’\'\,]#_#g" |
  /bin/bash
# Example nohup commands for downloading on Linux: each one runs a fetch
# script from $HOME/Downloads/WWDC/ in the background and redirects both
# stdout and stderr (&>) to a log file in the current directory.
nohup bash -c 'cd $HOME/Downloads/WWDC/; ./wwdc2020_fetch_mp4.sh' &> nohup.wwdc2020.mp4.out &
nohup bash -c 'cd $HOME/Downloads/WWDC/; ./wwdc2020_fetch_en.srt.sh' &> nohup.wwdc2020.en.srt.out &
# Alternatively, start a named screen session and run the download script
# inside it.
screen -S wwdcdownloadmp4
bash wwdc2020_fetch_mp4.sh
If you want to remove special characters from the file names, run "brew install rename" and then use this command to rename the files:
rename "s/[\@:’\'\,]//g" *.mp4 *.srt
No comments:
Post a Comment