#!/usr/bin/env python3
"""
Analyze video transcript to identify topics and create chapter divisions.
"""

import json
import re
import sys
from datetime import timedelta

# Transcript analysed when no explicit path is supplied.
DEFAULT_TRANSCRIPT_PATH = (
    "/Users/dwindown/CascadeProjects/MeetDwindiCom/access-hub/"
    "Live Zoom - Diskusi Cara Jual Jasa via Online.json"
)

# Target times (in seconds) at which a nearby segment is sampled: every
# 5 minutes from 0 up to and including 9000 s.
SAMPLE_POINTS = range(0, 9001, 300)

# A sampled segment must start within this many seconds of its target time.
SAMPLE_TOLERANCE_SECONDS = 60

# Sampled segments whose cleaned text is not longer than this are skipped.
MIN_SAMPLE_TEXT_LENGTH = 50

# Width of each bucket in the "content by time periods" report.
PERIOD_SECONDS = 600  # 10 minutes in seconds

# NOTE: the two tables below are kept for the "transition phrases" and
# "topic mentions" report sections. Those sections currently print only
# their headers; no search over the segments is performed with this data.
TRANSITION_PHRASES = [
    'oke', 'jadi', 'nah', 'kemudian', 'selanjutnya', 'setelah itu',
    'sekarang', 'selanjutnya', 'lanjut', 'terus', 'setelah',
    'nah sekarang', 'oke jadi', 'jadi sekarang', 'nah kalau',
]

TOPIC_KEYWORDS = {
    'Market/Community': ['market', 'pasar', 'grup', 'komunitas', 'community', 'telegram', 'facebook'],
    'Problem Finding': ['masalah', 'problem', 'kesulitan', 'permasalahan'],
    'Personal Branding': ['branding', 'personal branding', 'show off'],
    'AIDA Funnel': ['aida', 'awareness', 'interest', 'desire', 'action', 'funel', 'funnel'],
    'Getting Clients': ['klien', 'client', 'calon klien'],
    'Pricing/Payment': ['harga', 'bayar', 'budget', 'invoice', 'price'],
    'Negotiation': ['tawar', 'negosiasi', 'deal'],
    'Trust Building': ['trust', 'percaya', 'kepercayaan'],
    'Services/Products': ['jasa', 'service', 'produk', 'elementor', 'plugin', 'website'],
    'Cold/Warm/Hot Market': ['cold market', 'warm market', 'hot market'],
}


def seconds_to_timestamp(seconds):
    """Convert seconds to a readable ``HH:MM:SS`` timestamp.

    Fractional seconds are truncated. Hours are not wrapped at 24, so a
    value of 90000 yields ``"25:00:00"``. Negative values are rendered
    with a leading ``-``.
    """
    # total_seconds() includes the days component; the .seconds attribute
    # alone would silently drop every full day.
    total = int(timedelta(seconds=float(seconds)).total_seconds())
    sign = "-" if total < 0 else ""
    hours, remainder = divmod(abs(total), 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{secs:02d}"


def load_transcript(file_path):
    """Load a JSON transcript file and return the parsed data."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def extract_text_with_timestamps(data):
    """Extract text segments with timestamps.

    ``data`` may be a list of dicts that each carry an ``'events'`` list,
    or a single such dict. Every ``'segs'`` item that has a ``'utf8'`` key
    becomes ``{'start': <seconds>, 'text': <utf8 value>}``, where the start
    is the event's ``'tStartMs'`` (default 0) divided by 1000.

    Returns the segments in the order they are encountered.
    """
    if isinstance(data, dict):
        # A single top-level object: treat it as a one-entry list instead
        # of iterating over its keys.
        entries = [data]
    else:
        entries = data or []

    segments = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        for event in entry.get('events') or []:
            if 'segs' not in event:
                continue
            segments.extend(
                {
                    'start': float(event.get('tStartMs', 0)) / 1000,
                    'text': seg['utf8'],
                }
                for seg in event['segs']
                if 'utf8' in seg
            )
    return segments


def clean_text(text):
    """Clean transcript text by collapsing runs of whitespace to one space."""
    return ' '.join(text.split())


def identify_keywords(text):
    """Return the keyword table for the Indonesian business context.

    NOTE: ``text`` is accepted for interface compatibility but is not
    inspected; the same category -> keyword-list mapping is returned on
    every call (as a fresh dict).
    """
    keywords = {
        'market': ['market', 'pasar', 'grup', 'komunitas', 'community'],
        'problem': ['masalah', 'problem', 'kesulitan', 'error', 'gagal'],
        'branding': ['branding', 'personal branding', 'image', 'citra'],
        'funnel': ['funel', 'funnel', 'awareness', 'desire', 'action'],
        'client': ['klien', 'client', 'pelanggan', 'customer'],
        'price': ['harga', 'price', 'bayar', 'paid', 'invoice'],
        'negotiation': ['tawar', 'negosiasi', 'deal'],
        'service': ['jasa', 'service', 'website', 'plugin'],
        'exploration': ['explore', 'eksplor', 'coba', 'trial'],
        'network': ['network', 'jaringan', 'koneksi'],
        'sharing': ['sharing', 'share', 'bagi'],
        'product': ['produk', 'product', 'template', 'plugin'],
        'trust': ['trust', 'percaya', 'kepercayaan'],
        'sales': ['jual', 'sales', 'closing'],
    }
    return keywords


def _print_interval_samples(segments):
    """Print the segment nearest to each sample point, if close and long enough."""
    print("\n=== SAMPLING SEGMENTS AT KEY INTERVALS ===\n")

    for target_time in SAMPLE_POINTS:
        # min() returns the first of equally-close segments.
        closest = min(segments, key=lambda seg: abs(seg['start'] - target_time))
        if abs(closest['start'] - target_time) >= SAMPLE_TOLERANCE_SECONDS:
            continue

        text = clean_text(closest['text'])
        if len(text) > MIN_SAMPLE_TEXT_LENGTH:  # Only meaningful segments
            print(f"[{seconds_to_timestamp(closest['start'])}]")
            print(f"{text[:200]}...")
            print()


def _print_periods(segments, total_duration):
    """Print a text excerpt for every 10-minute period up to ``total_duration``."""
    print("\n=== CONTENT BY TIME PERIODS (every 10 minutes) ===\n")

    current_period = 0
    # "<=" so that a final segment starting exactly on a period boundary
    # still gets a period of its own instead of being dropped.
    while current_period <= total_duration:
        period_end = current_period + PERIOD_SECONDS
        period_segments = [
            s for s in segments if current_period <= s['start'] < period_end
        ]

        if period_segments:
            # Combine text from the first 20 segments, capped at 500 chars.
            period_text = ' '.join(
                clean_text(s['text']) for s in period_segments[:20]
            )[:500]

            print(f"\n{'='*70}")
            print(f"PERIOD: {seconds_to_timestamp(current_period)} - {seconds_to_timestamp(period_end)}")
            print(f"{'='*70}")
            print(period_text)
            print()

        current_period = period_end


def analyze_structure(file_path=None):
    """Analyze the transcript structure and print a report to stdout.

    Args:
        file_path: Path of the JSON transcript to analyze. When omitted,
            ``DEFAULT_TRANSCRIPT_PATH`` is used.

    The report contains the segment count, the total duration, one sampled
    segment near each 5-minute mark, and a text excerpt per 10-minute
    period. If the transcript yields no segments, a short notice is printed
    and the function returns.
    """
    if file_path is None:
        file_path = DEFAULT_TRANSCRIPT_PATH

    print("Loading transcript...")
    data = load_transcript(file_path)

    print("Extracting segments...")
    segments = extract_text_with_timestamps(data)

    print(f"\nTotal segments: {len(segments)}")

    if not segments:
        # Nothing to sample or bucket; a duration cannot be computed either.
        print("No transcript segments found; nothing to analyze.")
        return

    # Use the latest start time rather than the last list item, so the
    # result does not depend on the segments being sorted.
    total_duration = max(seg['start'] for seg in segments)
    print(f"Total duration: {seconds_to_timestamp(total_duration)} ({total_duration/60:.1f} minutes)")

    # Sample segments at different intervals
    _print_interval_samples(segments)

    # Look for transition phrases
    print("\n=== LOOKING FOR TRANSITION PHRASES ===\n")

    # Look for sections mentioning main topics
    print("\n=== SEARCHING FOR TOPIC MENTIONS ===\n")

    # Find segments grouped by time periods
    _print_periods(segments, total_duration)


if __name__ == "__main__":
    # An optional first command-line argument overrides the default path.
    analyze_structure(sys.argv[1] if len(sys.argv) > 1 else None)