#!/usr/bin/env python3
"""
Analyze a video transcript to identify topics and create chapter divisions.

Expects a caption-style JSON export: a list whose first element contains
'tracks', each track holding timed 'events' with text segments.
"""
import json
from datetime import timedelta


def seconds_to_timestamp(seconds):
    """Convert seconds to a readable HH:MM:SS timestamp."""
    td = timedelta(seconds=float(seconds))
    total_seconds = int(td.total_seconds())
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def load_transcript(file_path):
    """Load the JSON transcript file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def extract_transcript_segments(data):
    """Extract all transcript segments with start/end timestamps."""
    segments = []
    # The structure has a 'tracks' key inside the first list element
    if 'tracks' in data[0]:
        for track in data[0]['tracks']:
            if track['kind'] == 'asr':  # Automatic Speech Recognition track
                for event in track['events']:
                    start_time = event.get('tStartMs', 0) / 1000
                    duration = event.get('dDurationMs', 0) / 1000
                    # Each event carries its text in a list of 'segs'
                    text_parts = []
                    if 'segs' in event:
                        for seg in event['segs']:
                            if 'utf8' in seg:
                                text_parts.append(seg['utf8'])
                    text = ' '.join(text_parts)
                    if text.strip():
                        segments.append({
                            'start': start_time,
                            'end': start_time + duration,
                            'text': text,
                        })
    return segments


def group_by_time_window(segments, window_seconds=600):
    """Group segments into fixed time windows for analysis."""
    groups = []
    if not segments:
        return groups
    current_time = 0
    while current_time < segments[-1]['end']:
        window_end = current_time + window_seconds
        window_segments = [s for s in segments if current_time <= s['start'] < window_end]
        if window_segments:
            combined_text = ' '.join(s['text'] for s in window_segments)
            groups.append({
                'start': current_time,
                'end': window_end,
                'segments': window_segments,
                'text': combined_text,
            })
        current_time = window_end
    return groups


def extract_keywords(text):
    """Extract key topics from text via simple substring keyword matching."""
    keywords = {
        'Market & Community': ['market', 'pasar', 'grup', 'komunitas', 'telegram', 'facebook', 'forum'],
        'Problem Finding': ['masalah', 'problem', 'kesulitan', 'permasalahan', 'error'],
        'Exploration': ['explor', 'coba', 'trial', 'nyoba', 'eksplor'],
        'Personal Branding': ['branding', 'personal branding', 'show off', 'image'],
        'AIDA/Funnel': ['aida', 'awareness', 'interest', 'desire', 'action', 'funel', 'funnel'],
        'Trust': ['trust', 'percaya', 'kepercayaan'],
        'Clients': ['klien', 'client', 'pelanggan', 'customer'],
        'Pricing': ['harga', 'price', 'bayar', 'budget', 'rp', 'juta', 'ribu'],
        'Negotiation': ['tawar', 'negosiasi', 'deal'],
        'Services': ['jasa', 'service', 'website', 'plugin', 'elementor', 'instal'],
        'Cold/Warm/Hot Market': ['cold', 'warm', 'hot', 'dingin', 'hangat'],
        'Network': ['network', 'jaringan', 'koneksi', 'hubungan'],
        'Sharing': ['sharing', 'share', 'bagi'],
        'Products': ['produk', 'product', 'template'],
    }

    found = []
    text_lower = text.lower()
    for topic, kw_list in keywords.items():
        # Count how many of the topic's keywords appear anywhere in the text
        count = sum(1 for kw in kw_list if kw.lower() in text_lower)
        if count > 0:
            found.append((topic, count))
    return sorted(found, key=lambda x: x[1], reverse=True)


def identify_main_topics():
    """Identify the main topics throughout the video."""
    file_path = "/Users/dwindown/CascadeProjects/MeetDwindiCom/access-hub/Live Zoom - Diskusi Cara Jual Jasa via Online.json"

    print("Loading transcript...")
    data = load_transcript(file_path)

    print("Extracting segments...")
    segments = extract_transcript_segments(data)
    print(f"Total segments: {len(segments)}")

    if not segments:
        print("No segments found!")
        return

    total_duration = segments[-1]['end']
    print(f"Total duration: {seconds_to_timestamp(total_duration)} ({total_duration / 60:.1f} minutes)")

    print("\n" + "=" * 80)
    print("ANALYZING CONTENT IN 10-MINUTE INTERVALS")
    print("=" * 80 + "\n")

    # Group by 10-minute windows
    groups = group_by_time_window(segments, window_seconds=600)

    for i, group in enumerate(groups, 1):
        print(f"\n{'=' * 80}")
        print(f"SECTION {i}: {seconds_to_timestamp(group['start'])} - {seconds_to_timestamp(group['end'])}")
        print(f"{'=' * 80}")

        # Show the first 500 characters as a content preview
        preview = group['text'][:500]
        print(f"\nContent Preview:\n{preview}...")

        # Extract keywords
        keywords = extract_keywords(group['text'])
        if keywords:
            print("\nMain Topics:")
            for topic, count in keywords[:5]:
                print(f"  - {topic}: {count} mentions")

    print("\n" + "=" * 80)
    print("DETAILED BREAKDOWN (5-minute intervals for first hour)")
    print("=" * 80 + "\n")

    # More detail for the first hour: keep only segments starting within 3600 s
    first_hour = [s for s in segments if s['start'] < 3600]
    detailed_groups = group_by_time_window(first_hour, window_seconds=300)

    for i, group in enumerate(detailed_groups, 1):
        print(f"\n--- {seconds_to_timestamp(group['start'])} - {seconds_to_timestamp(group['end'])} ---")
        # Short text summary for the window
        text_summary = group['text'][:300]
        print(f"{text_summary}...")


if __name__ == "__main__":
    identify_main_topics()