qemu with hax to log dma reads & writes jcs.org/2018/11/12/vfio

ppc/spapr: Don't kill the guest if a recovered FWNMI machine check delivery fails

Try to be tolerant of FWNMI delivery errors if the machine check had been
recovered by the host.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Message-Id: <20200325142906.221248-5-npiggin@gmail.com>
Reviewed-by: Greg Kurz <groug@kaod.org>
[dwg: Updated comment at Greg's suggestion]
Signed-off-by: David Gibson <david@gibson.dropbear.id.au>

Authored by Nicholas Piggin and committed by David Gibson
4f7a11f9 b90b9ecb

+25 -5
+25 -5
hw/ppc/spapr_events.c
··· 833 833 /* get rtas addr from fdt */ 834 834 rtas_addr = spapr_get_rtas_addr(); 835 835 if (!rtas_addr) { 836 - error_report( 836 + if (!recovered) { 837 + error_report( 837 838 "FWNMI: Unable to deliver machine check to guest: rtas_addr not found."); 838 - qemu_system_guest_panicked(NULL); 839 + qemu_system_guest_panicked(NULL); 840 + } else { 841 + warn_report( 842 + "FWNMI: Unable to deliver machine check to guest: rtas_addr not found. " 843 + "Machine check recovered."); 844 + } 839 845 g_free(ext_elog); 840 846 return; 841 847 } 842 848 849 + /* 850 + * By taking the interlock, we assume that the MCE will be 851 + * delivered to the guest. CAUTION: don't add anything that could 852 + * prevent the MCE to be delivered after this line, otherwise the 853 + * guest won't be able to release the interlock and ultimately 854 + * hang/crash? 855 + */ 856 + spapr->fwnmi_machine_check_interlock = cpu->vcpu_id; 857 + 843 858 stq_be_phys(&address_space_memory, rtas_addr + RTAS_ERROR_LOG_OFFSET, 844 859 env->gpr[3]); 845 860 cpu_physical_memory_write(rtas_addr + RTAS_ERROR_LOG_OFFSET + ··· 876 891 * that CPU called "ibm,nmi-interlock") 877 892 */ 878 893 if (spapr->fwnmi_machine_check_interlock == cpu->vcpu_id) { 879 - error_report( 894 + if (!recovered) { 895 + error_report( 880 896 "FWNMI: Unable to deliver machine check to guest: nested machine check."); 881 - qemu_system_guest_panicked(NULL); 897 + qemu_system_guest_panicked(NULL); 898 + } else { 899 + warn_report( 900 + "FWNMI: Unable to deliver machine check to guest: nested machine check. " 901 + "Machine check recovered."); 902 + } 882 903 return; 883 904 } 884 905 qemu_cond_wait_iothread(&spapr->fwnmi_machine_check_interlock_cond); ··· 906 927 warn_report("Received a fwnmi while migration was in progress"); 907 928 } 908 929 909 - spapr->fwnmi_machine_check_interlock = cpu->vcpu_id; 910 930 spapr_mce_dispatch_elog(cpu, recovered); 911 931 } 912 932