meet-hub/analyze_transcript.py
dwindown a9ad84eb23 Fix duplicate video embed when youtube_url is empty string
- Add .trim() checks to all video source conditions
- Prevents rendering empty youtube_url as valid video
- Fixes double embed card display issue
- Update sidebar icon check to use optional chaining with trim

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 21:11:35 +07:00
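
The trim-based guards this commit describes might look roughly like the sketch below; the `Lesson` shape and the helper names are illustrative assumptions, not code taken from this repository.

```typescript
// Hypothetical sketch of the guards described in the commit above;
// the Lesson shape and helper names are assumptions, not repo code.
interface Lesson {
  youtube_url?: string;
  video_file?: string;
}

// Treat an empty or whitespace-only youtube_url as "no video", so the
// YouTube embed and the fallback video card never render at the same time.
function hasYoutubeVideo(lesson: Lesson): boolean {
  return Boolean(lesson.youtube_url && lesson.youtube_url.trim());
}

function hasUploadedVideo(lesson: Lesson): boolean {
  return Boolean(lesson.video_file && lesson.video_file.trim());
}

// Sidebar icon check using optional chaining plus trim, per the last bullet.
function hasAnyVideo(lesson?: Lesson): boolean {
  return Boolean(lesson?.youtube_url?.trim() || lesson?.video_file?.trim());
}
```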


#!/usr/bin/env python3
"""
Analyze video transcript to identify topics and create chapter divisions.
"""
import json
import re
from datetime import timedelta


def seconds_to_timestamp(seconds):
    """Convert seconds to readable timestamp."""
    td = timedelta(seconds=float(seconds))
    hours, remainder = divmod(td.seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def load_transcript(file_path):
    """Load JSON transcript file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data


def extract_text_with_timestamps(data):
    """Extract text segments with timestamps."""
    segments = []
    for entry in data:
        if 'events' in entry:
            for event in entry['events']:
                if 'segs' in event:
                    for seg in event['segs']:
                        if 'utf8' in seg:
                            segments.append({
                                'start': float(event.get('tStartMs', 0)) / 1000,
                                'text': seg['utf8']
                            })
    return segments


def clean_text(text):
    """Clean transcript text."""
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text


def identify_keywords(text):
    """Identify important keywords in Indonesian business context."""
    keywords = {
        'market': ['market', 'pasar', 'grup', 'komunitas', 'community'],
        'problem': ['masalah', 'problem', 'kesulitan', 'error', 'gagal'],
        'branding': ['branding', 'personal branding', 'image', 'citra'],
        'funnel': ['funel', 'funnel', 'awareness', 'desire', 'action'],
        'client': ['klien', 'client', 'pelanggan', 'customer'],
        'price': ['harga', 'price', 'bayar', 'paid', 'invoice'],
        'negotiation': ['tawar', 'negosiasi', 'deal'],
        'service': ['jasa', 'service', 'website', 'plugin'],
        'exploration': ['explore', 'eksplor', 'coba', 'trial'],
        'network': ['network', 'jaringan', 'koneksi'],
        'sharing': ['sharing', 'share', 'bagi'],
        'product': ['produk', 'product', 'template', 'plugin'],
        'trust': ['trust', 'percaya', 'kepercayaan'],
        'sales': ['jual', 'sales', 'closing'],
    }
    return keywords


def analyze_structure():
    """Analyze the transcript structure."""
    file_path = "/Users/dwindown/CascadeProjects/MeetDwindiCom/access-hub/Live Zoom - Diskusi Cara Jual Jasa via Online.json"

    print("Loading transcript...")
    data = load_transcript(file_path)

    print("Extracting segments...")
    segments = extract_text_with_timestamps(data)
    print(f"\nTotal segments: {len(segments)}")

    # Get total duration
    total_duration = 0  # fallback so the period loop below still terminates on an empty transcript
    if segments:
        total_duration = segments[-1]['start']
        print(f"Total duration: {seconds_to_timestamp(total_duration)} ({total_duration/60:.1f} minutes)")

    # Sample segments at different intervals
    print("\n=== SAMPLING SEGMENTS AT KEY INTERVALS ===\n")
    sample_points = [0, 300, 600, 900, 1200, 1500, 1800, 2100, 2400, 2700, 3000,
                     3300, 3600, 3900, 4200, 4500, 4800, 5100, 5400, 5700, 6000,
                     6300, 6600, 6900, 7200, 7500, 7800, 8100, 8400, 8700, 9000]
    for i, target_time in enumerate(sample_points):
        # Find closest segment
        closest = None
        min_diff = float('inf')
        for seg in segments:
            diff = abs(seg['start'] - target_time)
            if diff < min_diff:
                min_diff = diff
                closest = seg
        if closest and min_diff < 60:  # Within 1 minute
            text = clean_text(closest['text'])
            if len(text) > 50:  # Only meaningful segments
                print(f"[{seconds_to_timestamp(closest['start'])}]")
                print(f"{text[:200]}...")
                print()

    # Look for transition phrases
    print("\n=== LOOKING FOR TRANSITION PHRASES ===\n")
    transition_phrases = [
        'oke',
        'jadi',
        'nah',
        'kemudian',
        'selanjutnya',
        'setelah itu',
        'sekarang',
        'selanjutnya',
        'lanjut',
        'terus',
        'setelah',
        'nah sekarang',
        'oke jadi',
        'jadi sekarang',
        'nah kalau',
    ]

    # Look for sections mentioning main topics
    print("\n=== SEARCHING FOR TOPIC MENTIONS ===\n")
    topic_keywords = {
        'Market/Community': ['market', 'pasar', 'grup', 'komunitas', 'community', 'telegram', 'facebook'],
        'Problem Finding': ['masalah', 'problem', 'kesulitan', 'permasalahan'],
        'Personal Branding': ['branding', 'personal branding', 'show off'],
        'AIDA Funnel': ['aida', 'awareness', 'interest', 'desire', 'action', 'funel', 'funnel'],
        'Getting Clients': ['klien', 'client', 'calon klien'],
        'Pricing/Payment': ['harga', 'bayar', 'budget', 'invoice', 'price'],
        'Negotiation': ['tawar', 'negosiasi', 'deal'],
        'Trust Building': ['trust', 'percaya', 'kepercayaan'],
        'Services/Products': ['jasa', 'service', 'produk', 'elementor', 'plugin', 'website'],
        'Cold/Warm/Hot Market': ['cold market', 'warm market', 'hot market'],
    }

    # Find segments grouped by time periods
    print("\n=== CONTENT BY TIME PERIODS (every 10 minutes) ===\n")
    period = 600  # 10 minutes in seconds
    current_period = 0
    while current_period < total_duration:
        period_end = current_period + period
        period_segments = [s for s in segments if current_period <= s['start'] < period_end]
        if period_segments:
            # Combine text from this period
            period_text = ' '.join([clean_text(s['text']) for s in period_segments[:20]])  # First 20 segments
            period_text = period_text[:500]  # First 500 chars
            print(f"\n{'='*70}")
            print(f"PERIOD: {seconds_to_timestamp(current_period)} - {seconds_to_timestamp(period_end)}")
            print(f"{'='*70}")
            print(period_text)
            print()
        current_period = period_end


if __name__ == "__main__":
    analyze_structure()