qemu with hax to log dma reads & writes (jcs.org/2018/11/12/vfio)
at jcs-vmm (4052 lines, 122 kB)
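The listing below is QEMU's exec.c as carried in jcs-vmm; the excerpt shown here covers only the top of the file, and the DMA-logging changes themselves sit further down, in the guest-physical read/write path (flatview_read()/flatview_write()). As a rough sketch of the kind of instrumentation the title describes, with the helper name and hook points assumed for illustration rather than taken from this listing:

/* hypothetical sketch: a helper the flatview read/write path could
 * call to log each guest-physical access; the actual patch in this
 * file may hook different functions or print different fields */
static void log_dma_access(const char *kind, hwaddr addr, hwaddr len)
{
    fprintf(stderr, "%s: addr=0x%" HWADDR_PRIx " len=%" HWADDR_PRIu "\n",
            kind, addr, len);
}

Called as, e.g., log_dma_access("dma read", addr, len) from flatview_read() and with "dma write" from flatview_write().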
1/* 2 * Virtual page mapping 3 * 4 * Copyright (c) 2003 Fabrice Bellard 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Lesser General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Lesser General Public License for more details. 15 * 16 * You should have received a copy of the GNU Lesser General Public 17 * License along with this library; if not, see <http://www.gnu.org/licenses/>. 18 */ 19 20#include "qemu/osdep.h" 21#include "qemu-common.h" 22#include "qapi/error.h" 23 24#include "qemu/cutils.h" 25#include "cpu.h" 26#include "exec/exec-all.h" 27#include "exec/target_page.h" 28#include "tcg/tcg.h" 29#include "hw/qdev-core.h" 30#include "hw/qdev-properties.h" 31#if !defined(CONFIG_USER_ONLY) 32#include "hw/boards.h" 33#include "hw/xen/xen.h" 34#endif 35#include "sysemu/kvm.h" 36#include "sysemu/sysemu.h" 37#include "sysemu/tcg.h" 38#include "sysemu/qtest.h" 39#include "qemu/timer.h" 40#include "qemu/config-file.h" 41#include "qemu/error-report.h" 42#include "qemu/qemu-print.h" 43#if defined(CONFIG_USER_ONLY) 44#include "qemu.h" 45#else /* !CONFIG_USER_ONLY */ 46#include "exec/memory.h" 47#include "exec/ioport.h" 48#include "sysemu/dma.h" 49#include "sysemu/hostmem.h" 50#include "sysemu/hw_accel.h" 51#include "exec/address-spaces.h" 52#include "sysemu/xen-mapcache.h" 53#include "trace-root.h" 54 55#ifdef CONFIG_FALLOCATE_PUNCH_HOLE 56#include <linux/falloc.h> 57#endif 58 59#endif 60#include "qemu/rcu_queue.h" 61#include "qemu/main-loop.h" 62#include "translate-all.h" 63#include "sysemu/replay.h" 64 65#include "exec/memory-internal.h" 66#include "exec/ram_addr.h" 67#include "exec/log.h" 68 69#include "qemu/pmem.h" 70 71#include "migration/vmstate.h" 72 73#include "qemu/range.h" 74#ifndef _WIN32 75#include "qemu/mmap-alloc.h" 76#endif 77 78#include "monitor/monitor.h" 79 80//#define DEBUG_SUBPAGE 81 82#if !defined(CONFIG_USER_ONLY) 83/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes 84 * are protected by the ramlist lock. 85 */ 86RAMList ram_list = { .blocks = QLIST_HEAD_INITIALIZER(ram_list.blocks) }; 87 88static MemoryRegion *system_memory; 89static MemoryRegion *system_io; 90 91AddressSpace address_space_io; 92AddressSpace address_space_memory; 93 94static MemoryRegion io_mem_unassigned; 95#endif 96 97CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus); 98 99/* current CPU in the current thread. It is only valid inside 100 cpu_exec() */ 101__thread CPUState *current_cpu; 102 103uintptr_t qemu_host_page_size; 104intptr_t qemu_host_page_mask; 105 106#if !defined(CONFIG_USER_ONLY) 107/* 0 = Do not count executed instructions. 108 1 = Precise instruction counting. 109 2 = Adaptive rate instruction counting. */ 110int use_icount; 111 112typedef struct PhysPageEntry PhysPageEntry; 113 114struct PhysPageEntry { 115 /* How many bits skip to next level (in units of L2_SIZE). 0 for a leaf. */ 116 uint32_t skip : 6; 117 /* index into phys_sections (!skip) or phys_map_nodes (skip) */ 118 uint32_t ptr : 26; 119}; 120 121#define PHYS_MAP_NODE_NIL (((uint32_t)~0) >> 6) 122 123/* Size of the L2 (and L3, etc) page tables. 
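 * A worked example (assuming a 4 KiB target page, i.e. TARGET_PAGE_BITS = 12, which is not spelled out here): the P_L2_LEVELS formula below evaluates to ((64 - 12 - 1) / 9) + 1 = 6 levels of 512-entry tables.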
*/ 124#define ADDR_SPACE_BITS 64 125 126#define P_L2_BITS 9 127#define P_L2_SIZE (1 << P_L2_BITS) 128 129#define P_L2_LEVELS (((ADDR_SPACE_BITS - TARGET_PAGE_BITS - 1) / P_L2_BITS) + 1) 130 131typedef PhysPageEntry Node[P_L2_SIZE]; 132 133typedef struct PhysPageMap { 134 struct rcu_head rcu; 135 136 unsigned sections_nb; 137 unsigned sections_nb_alloc; 138 unsigned nodes_nb; 139 unsigned nodes_nb_alloc; 140 Node *nodes; 141 MemoryRegionSection *sections; 142} PhysPageMap; 143 144struct AddressSpaceDispatch { 145 MemoryRegionSection *mru_section; 146 /* This is a multi-level map on the physical address space. 147 * The bottom level has pointers to MemoryRegionSections. 148 */ 149 PhysPageEntry phys_map; 150 PhysPageMap map; 151}; 152 153#define SUBPAGE_IDX(addr) ((addr) & ~TARGET_PAGE_MASK) 154typedef struct subpage_t { 155 MemoryRegion iomem; 156 FlatView *fv; 157 hwaddr base; 158 uint16_t sub_section[]; 159} subpage_t; 160 161#define PHYS_SECTION_UNASSIGNED 0 162 163static void io_mem_init(void); 164static void memory_map_init(void); 165static void tcg_log_global_after_sync(MemoryListener *listener); 166static void tcg_commit(MemoryListener *listener); 167 168/** 169 * CPUAddressSpace: all the information a CPU needs about an AddressSpace 170 * @cpu: the CPU whose AddressSpace this is 171 * @as: the AddressSpace itself 172 * @memory_dispatch: its dispatch pointer (cached, RCU protected) 173 * @tcg_as_listener: listener for tracking changes to the AddressSpace 174 */ 175struct CPUAddressSpace { 176 CPUState *cpu; 177 AddressSpace *as; 178 struct AddressSpaceDispatch *memory_dispatch; 179 MemoryListener tcg_as_listener; 180}; 181 182struct DirtyBitmapSnapshot { 183 ram_addr_t start; 184 ram_addr_t end; 185 unsigned long dirty[]; 186}; 187 188#endif 189 190#if !defined(CONFIG_USER_ONLY) 191 192static void phys_map_node_reserve(PhysPageMap *map, unsigned nodes) 193{ 194 static unsigned alloc_hint = 16; 195 if (map->nodes_nb + nodes > map->nodes_nb_alloc) { 196 map->nodes_nb_alloc = MAX(alloc_hint, map->nodes_nb + nodes); 197 map->nodes = g_renew(Node, map->nodes, map->nodes_nb_alloc); 198 alloc_hint = map->nodes_nb_alloc; 199 } 200} 201 202static uint32_t phys_map_node_alloc(PhysPageMap *map, bool leaf) 203{ 204 unsigned i; 205 uint32_t ret; 206 PhysPageEntry e; 207 PhysPageEntry *p; 208 209 ret = map->nodes_nb++; 210 p = map->nodes[ret]; 211 assert(ret != PHYS_MAP_NODE_NIL); 212 assert(ret != map->nodes_nb_alloc); 213 214 e.skip = leaf ? 0 : 1; 215 e.ptr = leaf ? PHYS_SECTION_UNASSIGNED : PHYS_MAP_NODE_NIL; 216 for (i = 0; i < P_L2_SIZE; ++i) { 217 memcpy(&p[i], &e, sizeof(e)); 218 } 219 return ret; 220} 221 222static void phys_page_set_level(PhysPageMap *map, PhysPageEntry *lp, 223 hwaddr *index, uint64_t *nb, uint16_t leaf, 224 int level) 225{ 226 PhysPageEntry *p; 227 hwaddr step = (hwaddr)1 << (level * P_L2_BITS); 228 229 if (lp->skip && lp->ptr == PHYS_MAP_NODE_NIL) { 230 lp->ptr = phys_map_node_alloc(map, level == 0); 231 } 232 p = map->nodes[lp->ptr]; 233 lp = &p[(*index >> (level * P_L2_BITS)) & (P_L2_SIZE - 1)]; 234 235 while (*nb && lp < &p[P_L2_SIZE]) { 236 if ((*index & (step - 1)) == 0 && *nb >= step) { 237 lp->skip = 0; 238 lp->ptr = leaf; 239 *index += step; 240 *nb -= step; 241 } else { 242 phys_page_set_level(map, lp, index, nb, leaf, level - 1); 243 } 244 ++lp; 245 } 246} 247 248static void phys_page_set(AddressSpaceDispatch *d, 249 hwaddr index, uint64_t nb, 250 uint16_t leaf) 251{ 252 /* Wildly overreserve - it doesn't matter much. 
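 * A single phys_page_set_level() walk allocates at most a node or two per level (one for the straight path, plus edges where the range is split), so 3 * P_L2_LEVELS is a comfortable upper bound; and since phys_map_node_reserve() only ever grows the array, over-reserving merely wastes a few nodes' worth of memory.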
*/ 253 phys_map_node_reserve(&d->map, 3 * P_L2_LEVELS); 254 255 phys_page_set_level(&d->map, &d->phys_map, &index, &nb, leaf, P_L2_LEVELS - 1); 256} 257 258/* Compact a non leaf page entry. Simply detect that the entry has a single child, 259 * and update our entry so we can skip it and go directly to the destination. 260 */ 261static void phys_page_compact(PhysPageEntry *lp, Node *nodes) 262{ 263 unsigned valid_ptr = P_L2_SIZE; 264 int valid = 0; 265 PhysPageEntry *p; 266 int i; 267 268 if (lp->ptr == PHYS_MAP_NODE_NIL) { 269 return; 270 } 271 272 p = nodes[lp->ptr]; 273 for (i = 0; i < P_L2_SIZE; i++) { 274 if (p[i].ptr == PHYS_MAP_NODE_NIL) { 275 continue; 276 } 277 278 valid_ptr = i; 279 valid++; 280 if (p[i].skip) { 281 phys_page_compact(&p[i], nodes); 282 } 283 } 284 285 /* We can only compress if there's only one child. */ 286 if (valid != 1) { 287 return; 288 } 289 290 assert(valid_ptr < P_L2_SIZE); 291 292 /* Don't compress if it won't fit in the # of bits we have. */ 293 if (P_L2_LEVELS >= (1 << 6) && 294 lp->skip + p[valid_ptr].skip >= (1 << 6)) { 295 return; 296 } 297 298 lp->ptr = p[valid_ptr].ptr; 299 if (!p[valid_ptr].skip) { 300 /* If our only child is a leaf, make this a leaf. */ 301 /* By design, we should have made this node a leaf to begin with so we 302 * should never reach here. 303 * But since it's so simple to handle this, let's do it just in case we 304 * change this rule. 305 */ 306 lp->skip = 0; 307 } else { 308 lp->skip += p[valid_ptr].skip; 309 } 310} 311 312void address_space_dispatch_compact(AddressSpaceDispatch *d) 313{ 314 if (d->phys_map.skip) { 315 phys_page_compact(&d->phys_map, d->map.nodes); 316 } 317} 318 319static inline bool section_covers_addr(const MemoryRegionSection *section, 320 hwaddr addr) 321{ 322 /* Memory topology clips a memory region to [0, 2^64); size.hi > 0 means 323 * the section must cover the entire address space. 
324 */ 325 return int128_gethi(section->size) || 326 range_covers_byte(section->offset_within_address_space, 327 int128_getlo(section->size), addr); 328} 329 330static MemoryRegionSection *phys_page_find(AddressSpaceDispatch *d, hwaddr addr) 331{ 332 PhysPageEntry lp = d->phys_map, *p; 333 Node *nodes = d->map.nodes; 334 MemoryRegionSection *sections = d->map.sections; 335 hwaddr index = addr >> TARGET_PAGE_BITS; 336 int i; 337 338 for (i = P_L2_LEVELS; lp.skip && (i -= lp.skip) >= 0;) { 339 if (lp.ptr == PHYS_MAP_NODE_NIL) { 340 return &sections[PHYS_SECTION_UNASSIGNED]; 341 } 342 p = nodes[lp.ptr]; 343 lp = p[(index >> (i * P_L2_BITS)) & (P_L2_SIZE - 1)]; 344 } 345 346 if (section_covers_addr(&sections[lp.ptr], addr)) { 347 return &sections[lp.ptr]; 348 } else { 349 return &sections[PHYS_SECTION_UNASSIGNED]; 350 } 351} 352 353/* Called from RCU critical section */ 354static MemoryRegionSection *address_space_lookup_region(AddressSpaceDispatch *d, 355 hwaddr addr, 356 bool resolve_subpage) 357{ 358 MemoryRegionSection *section = atomic_read(&d->mru_section); 359 subpage_t *subpage; 360 361 if (!section || section == &d->map.sections[PHYS_SECTION_UNASSIGNED] || 362 !section_covers_addr(section, addr)) { 363 section = phys_page_find(d, addr); 364 atomic_set(&d->mru_section, section); 365 } 366 if (resolve_subpage && section->mr->subpage) { 367 subpage = container_of(section->mr, subpage_t, iomem); 368 section = &d->map.sections[subpage->sub_section[SUBPAGE_IDX(addr)]]; 369 } 370 return section; 371} 372 373/* Called from RCU critical section */ 374static MemoryRegionSection * 375address_space_translate_internal(AddressSpaceDispatch *d, hwaddr addr, hwaddr *xlat, 376 hwaddr *plen, bool resolve_subpage) 377{ 378 MemoryRegionSection *section; 379 MemoryRegion *mr; 380 Int128 diff; 381 382 section = address_space_lookup_region(d, addr, resolve_subpage); 383 /* Compute offset within MemoryRegionSection */ 384 addr -= section->offset_within_address_space; 385 386 /* Compute offset within MemoryRegion */ 387 *xlat = addr + section->offset_within_region; 388 389 mr = section->mr; 390 391 /* MMIO registers can be expected to perform full-width accesses based only 392 * on their address, without considering adjacent registers that could 393 * decode to completely different MemoryRegions. When such registers 394 * exist (e.g. I/O ports 0xcf8 and 0xcf9 on most PC chipsets), MMIO 395 * regions overlap wildly. For this reason we cannot clamp the accesses 396 * here. 397 * 398 * If the length is small (as is the case for address_space_ldl/stl), 399 * everything works fine. If the incoming length is large, however, 400 * the caller really has to do the clamping through memory_access_size. 401 */ 402 if (memory_region_is_ram(mr)) { 403 diff = int128_sub(section->size, int128_make64(addr)); 404 *plen = int128_get64(int128_min(diff, int128_make64(*plen))); 405 } 406 return section; 407} 408 409/** 410 * address_space_translate_iommu - translate an address through an IOMMU 411 * memory region and then through the target address space. 412 * 413 * @iommu_mr: the IOMMU memory region that we start the translation from 414 * @addr: the address to be translated through the MMU 415 * @xlat: the translated address offset within the destination memory region. 416 * It cannot be %NULL. 417 * @plen_out: valid read/write length of the translated address. It 418 * cannot be %NULL. 419 * @page_mask_out: page mask for the translated address. 
This 420 * should only be meaningful for IOMMU translated 421 * addresses, since there may be huge pages that this bit 422 * would tell. It can be %NULL if we don't care about it. 423 * @is_write: whether the translation operation is for write 424 * @is_mmio: whether this can be MMIO, set true if it can 425 * @target_as: the address space targeted by the IOMMU 426 * @attrs: transaction attributes 427 * 428 * This function is called from RCU critical section. It is the common 429 * part of flatview_do_translate and address_space_translate_cached. 430 */ 431static MemoryRegionSection address_space_translate_iommu(IOMMUMemoryRegion *iommu_mr, 432 hwaddr *xlat, 433 hwaddr *plen_out, 434 hwaddr *page_mask_out, 435 bool is_write, 436 bool is_mmio, 437 AddressSpace **target_as, 438 MemTxAttrs attrs) 439{ 440 MemoryRegionSection *section; 441 hwaddr page_mask = (hwaddr)-1; 442 443 do { 444 hwaddr addr = *xlat; 445 IOMMUMemoryRegionClass *imrc = memory_region_get_iommu_class_nocheck(iommu_mr); 446 int iommu_idx = 0; 447 IOMMUTLBEntry iotlb; 448 449 if (imrc->attrs_to_index) { 450 iommu_idx = imrc->attrs_to_index(iommu_mr, attrs); 451 } 452 453 iotlb = imrc->translate(iommu_mr, addr, is_write ? 454 IOMMU_WO : IOMMU_RO, iommu_idx); 455 456 if (!(iotlb.perm & (1 << is_write))) { 457 goto unassigned; 458 } 459 460 addr = ((iotlb.translated_addr & ~iotlb.addr_mask) 461 | (addr & iotlb.addr_mask)); 462 page_mask &= iotlb.addr_mask; 463 *plen_out = MIN(*plen_out, (addr | iotlb.addr_mask) - addr + 1); 464 *target_as = iotlb.target_as; 465 466 section = address_space_translate_internal( 467 address_space_to_dispatch(iotlb.target_as), addr, xlat, 468 plen_out, is_mmio); 469 470 iommu_mr = memory_region_get_iommu(section->mr); 471 } while (unlikely(iommu_mr)); 472 473 if (page_mask_out) { 474 *page_mask_out = page_mask; 475 } 476 return *section; 477 478unassigned: 479 return (MemoryRegionSection) { .mr = &io_mem_unassigned }; 480} 481 482/** 483 * flatview_do_translate - translate an address in FlatView 484 * 485 * @fv: the flat view that we want to translate on 486 * @addr: the address to be translated in above address space 487 * @xlat: the translated address offset within memory region. It 488 * cannot be @NULL. 489 * @plen_out: valid read/write length of the translated address. It 490 * can be @NULL when we don't care about it. 491 * @page_mask_out: page mask for the translated address. This 492 * should only be meaningful for IOMMU translated 493 * addresses, since there may be huge pages that this bit 494 * would tell. It can be @NULL if we don't care about it. 
495 * @is_write: whether the translation operation is for write 496 * @is_mmio: whether this can be MMIO, set true if it can 497 * @target_as: the address space targeted by the IOMMU 498 * @attrs: memory transaction attributes 499 * 500 * This function is called from RCU critical section 501 */ 502static MemoryRegionSection flatview_do_translate(FlatView *fv, 503 hwaddr addr, 504 hwaddr *xlat, 505 hwaddr *plen_out, 506 hwaddr *page_mask_out, 507 bool is_write, 508 bool is_mmio, 509 AddressSpace **target_as, 510 MemTxAttrs attrs) 511{ 512 MemoryRegionSection *section; 513 IOMMUMemoryRegion *iommu_mr; 514 hwaddr plen = (hwaddr)(-1); 515 516 if (!plen_out) { 517 plen_out = &plen; 518 } 519 520 section = address_space_translate_internal( 521 flatview_to_dispatch(fv), addr, xlat, 522 plen_out, is_mmio); 523 524 iommu_mr = memory_region_get_iommu(section->mr); 525 if (unlikely(iommu_mr)) { 526 return address_space_translate_iommu(iommu_mr, xlat, 527 plen_out, page_mask_out, 528 is_write, is_mmio, 529 target_as, attrs); 530 } 531 if (page_mask_out) { 532 /* Not behind an IOMMU, use default page size. */ 533 *page_mask_out = ~TARGET_PAGE_MASK; 534 } 535 536 return *section; 537} 538 539/* Called from RCU critical section */ 540IOMMUTLBEntry address_space_get_iotlb_entry(AddressSpace *as, hwaddr addr, 541 bool is_write, MemTxAttrs attrs) 542{ 543 MemoryRegionSection section; 544 hwaddr xlat, page_mask; 545 546 /* 547 * This can never be MMIO, and we don't really care about plen, 548 * but page mask. 549 */ 550 section = flatview_do_translate(address_space_to_flatview(as), addr, &xlat, 551 NULL, &page_mask, is_write, false, &as, 552 attrs); 553 554 /* Illegal translation */ 555 if (section.mr == &io_mem_unassigned) { 556 goto iotlb_fail; 557 } 558 559 /* Convert memory region offset into address space offset */ 560 xlat += section.offset_within_address_space - 561 section.offset_within_region; 562 563 return (IOMMUTLBEntry) { 564 .target_as = as, 565 .iova = addr & ~page_mask, 566 .translated_addr = xlat & ~page_mask, 567 .addr_mask = page_mask, 568 /* IOTLBs are for DMAs, and DMA only allows on RAMs. */ 569 .perm = IOMMU_RW, 570 }; 571 572iotlb_fail: 573 return (IOMMUTLBEntry) {0}; 574} 575 576/* Called from RCU critical section */ 577MemoryRegion *flatview_translate(FlatView *fv, hwaddr addr, hwaddr *xlat, 578 hwaddr *plen, bool is_write, 579 MemTxAttrs attrs) 580{ 581 MemoryRegion *mr; 582 MemoryRegionSection section; 583 AddressSpace *as = NULL; 584 585 /* This can be MMIO, so setup MMIO bit. */ 586 section = flatview_do_translate(fv, addr, xlat, plen, NULL, 587 is_write, true, &as, attrs); 588 mr = section.mr; 589 590 if (xen_enabled() && memory_access_is_direct(mr, is_write)) { 591 hwaddr page = ((addr & TARGET_PAGE_MASK) + TARGET_PAGE_SIZE) - addr; 592 *plen = MIN(page, *plen); 593 } 594 595 return mr; 596} 597 598typedef struct TCGIOMMUNotifier { 599 IOMMUNotifier n; 600 MemoryRegion *mr; 601 CPUState *cpu; 602 int iommu_idx; 603 bool active; 604} TCGIOMMUNotifier; 605 606static void tcg_iommu_unmap_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb) 607{ 608 TCGIOMMUNotifier *notifier = container_of(n, TCGIOMMUNotifier, n); 609 610 if (!notifier->active) { 611 return; 612 } 613 tlb_flush(notifier->cpu); 614 notifier->active = false; 615 /* We leave the notifier struct on the list to avoid reallocating it later. 616 * Generally the number of IOMMUs a CPU deals with will be small. 617 * In any case we can't unregister the iommu notifier from a notify 618 * callback. 
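 * Instead, the notifier is re-armed lazily: the next address_space_translate_for_iotlb() that walks through this IOMMU calls tcg_register_iommu_notifier(), which sets ->active again.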
619 */ 620} 621 622static void tcg_register_iommu_notifier(CPUState *cpu, 623 IOMMUMemoryRegion *iommu_mr, 624 int iommu_idx) 625{ 626 /* Make sure this CPU has an IOMMU notifier registered for this 627 * IOMMU/IOMMU index combination, so that we can flush its TLB 628 * when the IOMMU tells us the mappings we've cached have changed. 629 */ 630 MemoryRegion *mr = MEMORY_REGION(iommu_mr); 631 TCGIOMMUNotifier *notifier; 632 Error *err = NULL; 633 int i, ret; 634 635 for (i = 0; i < cpu->iommu_notifiers->len; i++) { 636 notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i); 637 if (notifier->mr == mr && notifier->iommu_idx == iommu_idx) { 638 break; 639 } 640 } 641 if (i == cpu->iommu_notifiers->len) { 642 /* Not found, add a new entry at the end of the array */ 643 cpu->iommu_notifiers = g_array_set_size(cpu->iommu_notifiers, i + 1); 644 notifier = g_new0(TCGIOMMUNotifier, 1); 645 g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i) = notifier; 646 647 notifier->mr = mr; 648 notifier->iommu_idx = iommu_idx; 649 notifier->cpu = cpu; 650 /* Rather than trying to register interest in the specific part 651 * of the iommu's address space that we've accessed and then 652 * expand it later as subsequent accesses touch more of it, we 653 * just register interest in the whole thing, on the assumption 654 * that iommu reconfiguration will be rare. 655 */ 656 iommu_notifier_init(&notifier->n, 657 tcg_iommu_unmap_notify, 658 IOMMU_NOTIFIER_UNMAP, 659 0, 660 HWADDR_MAX, 661 iommu_idx); 662 ret = memory_region_register_iommu_notifier(notifier->mr, &notifier->n, 663 &err); 664 if (ret) { 665 error_report_err(err); 666 exit(1); 667 } 668 } 669 670 if (!notifier->active) { 671 notifier->active = true; 672 } 673} 674 675static void tcg_iommu_free_notifier_list(CPUState *cpu) 676{ 677 /* Destroy the CPU's notifier list */ 678 int i; 679 TCGIOMMUNotifier *notifier; 680 681 for (i = 0; i < cpu->iommu_notifiers->len; i++) { 682 notifier = g_array_index(cpu->iommu_notifiers, TCGIOMMUNotifier *, i); 683 memory_region_unregister_iommu_notifier(notifier->mr, &notifier->n); 684 g_free(notifier); 685 } 686 g_array_free(cpu->iommu_notifiers, true); 687} 688 689/* Called from RCU critical section */ 690MemoryRegionSection * 691address_space_translate_for_iotlb(CPUState *cpu, int asidx, hwaddr addr, 692 hwaddr *xlat, hwaddr *plen, 693 MemTxAttrs attrs, int *prot) 694{ 695 MemoryRegionSection *section; 696 IOMMUMemoryRegion *iommu_mr; 697 IOMMUMemoryRegionClass *imrc; 698 IOMMUTLBEntry iotlb; 699 int iommu_idx; 700 AddressSpaceDispatch *d = atomic_rcu_read(&cpu->cpu_ases[asidx].memory_dispatch); 701 702 for (;;) { 703 section = address_space_translate_internal(d, addr, &addr, plen, false); 704 705 iommu_mr = memory_region_get_iommu(section->mr); 706 if (!iommu_mr) { 707 break; 708 } 709 710 imrc = memory_region_get_iommu_class_nocheck(iommu_mr); 711 712 iommu_idx = imrc->attrs_to_index(iommu_mr, attrs); 713 tcg_register_iommu_notifier(cpu, iommu_mr, iommu_idx); 714 /* We need all the permissions, so pass IOMMU_NONE so the IOMMU 715 * doesn't short-cut its translation table walk. 716 */ 717 iotlb = imrc->translate(iommu_mr, addr, IOMMU_NONE, iommu_idx); 718 addr = ((iotlb.translated_addr & ~iotlb.addr_mask) 719 | (addr & iotlb.addr_mask)); 720 /* Update the caller's prot bits to remove permissions the IOMMU 721 * is giving us a failure response for. If we get down to no 722 * permissions left at all we can give up now. 
723 */ 724 if (!(iotlb.perm & IOMMU_RO)) { 725 *prot &= ~(PAGE_READ | PAGE_EXEC); 726 } 727 if (!(iotlb.perm & IOMMU_WO)) { 728 *prot &= ~PAGE_WRITE; 729 } 730 731 if (!*prot) { 732 goto translate_fail; 733 } 734 735 d = flatview_to_dispatch(address_space_to_flatview(iotlb.target_as)); 736 } 737 738 assert(!memory_region_is_iommu(section->mr)); 739 *xlat = addr; 740 return section; 741 742translate_fail: 743 return &d->map.sections[PHYS_SECTION_UNASSIGNED]; 744} 745#endif 746 747#if !defined(CONFIG_USER_ONLY) 748 749static int cpu_common_post_load(void *opaque, int version_id) 750{ 751 CPUState *cpu = opaque; 752 753 /* 0x01 was CPU_INTERRUPT_EXIT. This line can be removed when the 754 version_id is increased. */ 755 cpu->interrupt_request &= ~0x01; 756 tlb_flush(cpu); 757 758 /* loadvm has just updated the content of RAM, bypassing the 759 * usual mechanisms that ensure we flush TBs for writes to 760 * memory we've translated code from. So we must flush all TBs, 761 * which will now be stale. 762 */ 763 tb_flush(cpu); 764 765 return 0; 766} 767 768static int cpu_common_pre_load(void *opaque) 769{ 770 CPUState *cpu = opaque; 771 772 cpu->exception_index = -1; 773 774 return 0; 775} 776 777static bool cpu_common_exception_index_needed(void *opaque) 778{ 779 CPUState *cpu = opaque; 780 781 return tcg_enabled() && cpu->exception_index != -1; 782} 783 784static const VMStateDescription vmstate_cpu_common_exception_index = { 785 .name = "cpu_common/exception_index", 786 .version_id = 1, 787 .minimum_version_id = 1, 788 .needed = cpu_common_exception_index_needed, 789 .fields = (VMStateField[]) { 790 VMSTATE_INT32(exception_index, CPUState), 791 VMSTATE_END_OF_LIST() 792 } 793}; 794 795static bool cpu_common_crash_occurred_needed(void *opaque) 796{ 797 CPUState *cpu = opaque; 798 799 return cpu->crash_occurred; 800} 801 802static const VMStateDescription vmstate_cpu_common_crash_occurred = { 803 .name = "cpu_common/crash_occurred", 804 .version_id = 1, 805 .minimum_version_id = 1, 806 .needed = cpu_common_crash_occurred_needed, 807 .fields = (VMStateField[]) { 808 VMSTATE_BOOL(crash_occurred, CPUState), 809 VMSTATE_END_OF_LIST() 810 } 811}; 812 813const VMStateDescription vmstate_cpu_common = { 814 .name = "cpu_common", 815 .version_id = 1, 816 .minimum_version_id = 1, 817 .pre_load = cpu_common_pre_load, 818 .post_load = cpu_common_post_load, 819 .fields = (VMStateField[]) { 820 VMSTATE_UINT32(halted, CPUState), 821 VMSTATE_UINT32(interrupt_request, CPUState), 822 VMSTATE_END_OF_LIST() 823 }, 824 .subsections = (const VMStateDescription*[]) { 825 &vmstate_cpu_common_exception_index, 826 &vmstate_cpu_common_crash_occurred, 827 NULL 828 } 829}; 830 831#endif 832 833CPUState *qemu_get_cpu(int index) 834{ 835 CPUState *cpu; 836 837 CPU_FOREACH(cpu) { 838 if (cpu->cpu_index == index) { 839 return cpu; 840 } 841 } 842 843 return NULL; 844} 845 846#if !defined(CONFIG_USER_ONLY) 847void cpu_address_space_init(CPUState *cpu, int asidx, 848 const char *prefix, MemoryRegion *mr) 849{ 850 CPUAddressSpace *newas; 851 AddressSpace *as = g_new0(AddressSpace, 1); 852 char *as_name; 853 854 assert(mr); 855 as_name = g_strdup_printf("%s-%d", prefix, cpu->cpu_index); 856 address_space_init(as, mr, as_name); 857 g_free(as_name); 858 859 /* Target code should have set num_ases before calling us */ 860 assert(asidx < cpu->num_ases); 861 862 if (asidx == 0) { 863 /* address space 0 gets the convenience alias */ 864 cpu->as = as; 865 } 866 867 /* KVM cannot currently support multiple address spaces. 
*/ 868 assert(asidx == 0 || !kvm_enabled()); 869 870 if (!cpu->cpu_ases) { 871 cpu->cpu_ases = g_new0(CPUAddressSpace, cpu->num_ases); 872 } 873 874 newas = &cpu->cpu_ases[asidx]; 875 newas->cpu = cpu; 876 newas->as = as; 877 if (tcg_enabled()) { 878 newas->tcg_as_listener.log_global_after_sync = tcg_log_global_after_sync; 879 newas->tcg_as_listener.commit = tcg_commit; 880 memory_listener_register(&newas->tcg_as_listener, as); 881 } 882} 883 884AddressSpace *cpu_get_address_space(CPUState *cpu, int asidx) 885{ 886 /* Return the AddressSpace corresponding to the specified index */ 887 return cpu->cpu_ases[asidx].as; 888} 889#endif 890 891void cpu_exec_unrealizefn(CPUState *cpu) 892{ 893 CPUClass *cc = CPU_GET_CLASS(cpu); 894 895 cpu_list_remove(cpu); 896 897 if (cc->vmsd != NULL) { 898 vmstate_unregister(NULL, cc->vmsd, cpu); 899 } 900 if (qdev_get_vmsd(DEVICE(cpu)) == NULL) { 901 vmstate_unregister(NULL, &vmstate_cpu_common, cpu); 902 } 903#ifndef CONFIG_USER_ONLY 904 tcg_iommu_free_notifier_list(cpu); 905#endif 906} 907 908Property cpu_common_props[] = { 909#ifndef CONFIG_USER_ONLY 910 /* Create a memory property for softmmu CPU object, 911 * so users can wire up its memory. (This can't go in hw/core/cpu.c 912 * because that file is compiled only once for both user-mode 913 * and system builds.) The default if no link is set up is to use 914 * the system address space. 915 */ 916 DEFINE_PROP_LINK("memory", CPUState, memory, TYPE_MEMORY_REGION, 917 MemoryRegion *), 918#endif 919 DEFINE_PROP_END_OF_LIST(), 920}; 921 922void cpu_exec_initfn(CPUState *cpu) 923{ 924 cpu->as = NULL; 925 cpu->num_ases = 0; 926 927#ifndef CONFIG_USER_ONLY 928 cpu->thread_id = qemu_get_thread_id(); 929 cpu->memory = system_memory; 930 object_ref(OBJECT(cpu->memory)); 931#endif 932} 933 934void cpu_exec_realizefn(CPUState *cpu, Error **errp) 935{ 936 CPUClass *cc = CPU_GET_CLASS(cpu); 937 static bool tcg_target_initialized; 938 939 cpu_list_add(cpu); 940 941 if (tcg_enabled() && !tcg_target_initialized) { 942 tcg_target_initialized = true; 943 cc->tcg_initialize(); 944 } 945 tlb_init(cpu); 946 947 qemu_plugin_vcpu_init_hook(cpu); 948 949#ifndef CONFIG_USER_ONLY 950 if (qdev_get_vmsd(DEVICE(cpu)) == NULL) { 951 vmstate_register(NULL, cpu->cpu_index, &vmstate_cpu_common, cpu); 952 } 953 if (cc->vmsd != NULL) { 954 vmstate_register(NULL, cpu->cpu_index, cc->vmsd, cpu); 955 } 956 957 cpu->iommu_notifiers = g_array_new(false, true, sizeof(TCGIOMMUNotifier *)); 958#endif 959} 960 961const char *parse_cpu_option(const char *cpu_option) 962{ 963 ObjectClass *oc; 964 CPUClass *cc; 965 gchar **model_pieces; 966 const char *cpu_type; 967 968 model_pieces = g_strsplit(cpu_option, ",", 2); 969 if (!model_pieces[0]) { 970 error_report("-cpu option cannot be empty"); 971 exit(1); 972 } 973 974 oc = cpu_class_by_name(CPU_RESOLVING_TYPE, model_pieces[0]); 975 if (oc == NULL) { 976 error_report("unable to find CPU model '%s'", model_pieces[0]); 977 g_strfreev(model_pieces); 978 exit(EXIT_FAILURE); 979 } 980 981 cpu_type = object_class_get_name(oc); 982 cc = CPU_CLASS(oc); 983 cc->parse_features(cpu_type, model_pieces[1], &error_fatal); 984 g_strfreev(model_pieces); 985 return cpu_type; 986} 987 988#if defined(CONFIG_USER_ONLY) 989void tb_invalidate_phys_addr(target_ulong addr) 990{ 991 mmap_lock(); 992 tb_invalidate_phys_page_range(addr, addr + 1); 993 mmap_unlock(); 994} 995 996static void breakpoint_invalidate(CPUState *cpu, target_ulong pc) 997{ 998 tb_invalidate_phys_addr(pc); 999} 1000#else 1001void 
tb_invalidate_phys_addr(AddressSpace *as, hwaddr addr, MemTxAttrs attrs) 1002{ 1003 ram_addr_t ram_addr; 1004 MemoryRegion *mr; 1005 hwaddr l = 1; 1006 1007 if (!tcg_enabled()) { 1008 return; 1009 } 1010 1011 RCU_READ_LOCK_GUARD(); 1012 mr = address_space_translate(as, addr, &addr, &l, false, attrs); 1013 if (!(memory_region_is_ram(mr) 1014 || memory_region_is_romd(mr))) { 1015 return; 1016 } 1017 ram_addr = memory_region_get_ram_addr(mr) + addr; 1018 tb_invalidate_phys_page_range(ram_addr, ram_addr + 1); 1019} 1020 1021static void breakpoint_invalidate(CPUState *cpu, target_ulong pc) 1022{ 1023 /* 1024 * There may not be a virtual to physical translation for the pc 1025 * right now, but there may exist cached TB for this pc. 1026 * Flush the whole TB cache to force re-translation of such TBs. 1027 * This is heavyweight, but we're debugging anyway. 1028 */ 1029 tb_flush(cpu); 1030} 1031#endif 1032 1033#ifndef CONFIG_USER_ONLY 1034/* Add a watchpoint. */ 1035int cpu_watchpoint_insert(CPUState *cpu, vaddr addr, vaddr len, 1036 int flags, CPUWatchpoint **watchpoint) 1037{ 1038 CPUWatchpoint *wp; 1039 1040 /* forbid ranges which are empty or run off the end of the address space */ 1041 if (len == 0 || (addr + len - 1) < addr) { 1042 error_report("tried to set invalid watchpoint at %" 1043 VADDR_PRIx ", len=%" VADDR_PRIu, addr, len); 1044 return -EINVAL; 1045 } 1046 wp = g_malloc(sizeof(*wp)); 1047 1048 wp->vaddr = addr; 1049 wp->len = len; 1050 wp->flags = flags; 1051 1052 /* keep all GDB-injected watchpoints in front */ 1053 if (flags & BP_GDB) { 1054 QTAILQ_INSERT_HEAD(&cpu->watchpoints, wp, entry); 1055 } else { 1056 QTAILQ_INSERT_TAIL(&cpu->watchpoints, wp, entry); 1057 } 1058 1059 tlb_flush_page(cpu, addr); 1060 1061 if (watchpoint) 1062 *watchpoint = wp; 1063 return 0; 1064} 1065 1066/* Remove a specific watchpoint. */ 1067int cpu_watchpoint_remove(CPUState *cpu, vaddr addr, vaddr len, 1068 int flags) 1069{ 1070 CPUWatchpoint *wp; 1071 1072 QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) { 1073 if (addr == wp->vaddr && len == wp->len 1074 && flags == (wp->flags & ~BP_WATCHPOINT_HIT)) { 1075 cpu_watchpoint_remove_by_ref(cpu, wp); 1076 return 0; 1077 } 1078 } 1079 return -ENOENT; 1080} 1081 1082/* Remove a specific watchpoint by reference. */ 1083void cpu_watchpoint_remove_by_ref(CPUState *cpu, CPUWatchpoint *watchpoint) 1084{ 1085 QTAILQ_REMOVE(&cpu->watchpoints, watchpoint, entry); 1086 1087 tlb_flush_page(cpu, watchpoint->vaddr); 1088 1089 g_free(watchpoint); 1090} 1091 1092/* Remove all matching watchpoints. */ 1093void cpu_watchpoint_remove_all(CPUState *cpu, int mask) 1094{ 1095 CPUWatchpoint *wp, *next; 1096 1097 QTAILQ_FOREACH_SAFE(wp, &cpu->watchpoints, entry, next) { 1098 if (wp->flags & mask) { 1099 cpu_watchpoint_remove_by_ref(cpu, wp); 1100 } 1101 } 1102} 1103 1104/* Return true if this watchpoint address matches the specified 1105 * access (ie the address range covered by the watchpoint overlaps 1106 * partially or completely with the address range covered by the 1107 * access). 1108 */ 1109static inline bool watchpoint_address_matches(CPUWatchpoint *wp, 1110 vaddr addr, vaddr len) 1111{ 1112 /* We know the lengths are non-zero, but a little caution is 1113 * required to avoid errors in the case where the range ends 1114 * exactly at the top of the address space and so addr + len 1115 * wraps round to zero. 
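 * Concretely, with a 64-bit vaddr: a watchpoint at vaddr = 0xfffffffffffff000 with len = 0x1000 has vaddr + len wrap to zero, while the inclusive end vaddr + len - 1 = 0xffffffffffffffff still compares correctly.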
1116 */ 1117 vaddr wpend = wp->vaddr + wp->len - 1; 1118 vaddr addrend = addr + len - 1; 1119 1120 return !(addr > wpend || wp->vaddr > addrend); 1121} 1122 1123/* Return flags for watchpoints that match addr + prot. */ 1124int cpu_watchpoint_address_matches(CPUState *cpu, vaddr addr, vaddr len) 1125{ 1126 CPUWatchpoint *wp; 1127 int ret = 0; 1128 1129 QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) { 1130 if (watchpoint_address_matches(wp, addr, TARGET_PAGE_SIZE)) { 1131 ret |= wp->flags; 1132 } 1133 } 1134 return ret; 1135} 1136#endif /* !CONFIG_USER_ONLY */ 1137 1138/* Add a breakpoint. */ 1139int cpu_breakpoint_insert(CPUState *cpu, vaddr pc, int flags, 1140 CPUBreakpoint **breakpoint) 1141{ 1142 CPUBreakpoint *bp; 1143 1144 bp = g_malloc(sizeof(*bp)); 1145 1146 bp->pc = pc; 1147 bp->flags = flags; 1148 1149 /* keep all GDB-injected breakpoints in front */ 1150 if (flags & BP_GDB) { 1151 QTAILQ_INSERT_HEAD(&cpu->breakpoints, bp, entry); 1152 } else { 1153 QTAILQ_INSERT_TAIL(&cpu->breakpoints, bp, entry); 1154 } 1155 1156 breakpoint_invalidate(cpu, pc); 1157 1158 if (breakpoint) { 1159 *breakpoint = bp; 1160 } 1161 return 0; 1162} 1163 1164/* Remove a specific breakpoint. */ 1165int cpu_breakpoint_remove(CPUState *cpu, vaddr pc, int flags) 1166{ 1167 CPUBreakpoint *bp; 1168 1169 QTAILQ_FOREACH(bp, &cpu->breakpoints, entry) { 1170 if (bp->pc == pc && bp->flags == flags) { 1171 cpu_breakpoint_remove_by_ref(cpu, bp); 1172 return 0; 1173 } 1174 } 1175 return -ENOENT; 1176} 1177 1178/* Remove a specific breakpoint by reference. */ 1179void cpu_breakpoint_remove_by_ref(CPUState *cpu, CPUBreakpoint *breakpoint) 1180{ 1181 QTAILQ_REMOVE(&cpu->breakpoints, breakpoint, entry); 1182 1183 breakpoint_invalidate(cpu, breakpoint->pc); 1184 1185 g_free(breakpoint); 1186} 1187 1188/* Remove all matching breakpoints. */ 1189void cpu_breakpoint_remove_all(CPUState *cpu, int mask) 1190{ 1191 CPUBreakpoint *bp, *next; 1192 1193 QTAILQ_FOREACH_SAFE(bp, &cpu->breakpoints, entry, next) { 1194 if (bp->flags & mask) { 1195 cpu_breakpoint_remove_by_ref(cpu, bp); 1196 } 1197 } 1198} 1199 1200/* enable or disable single step mode. EXCP_DEBUG is returned by the 1201 CPU loop after each instruction */ 1202void cpu_single_step(CPUState *cpu, int enabled) 1203{ 1204 if (cpu->singlestep_enabled != enabled) { 1205 cpu->singlestep_enabled = enabled; 1206 if (kvm_enabled()) { 1207 kvm_update_guest_debug(cpu, 0); 1208 } else { 1209 /* must flush all the translated code to avoid inconsistencies */ 1210 /* XXX: only flush what is necessary */ 1211 tb_flush(cpu); 1212 } 1213 } 1214} 1215 1216void cpu_abort(CPUState *cpu, const char *fmt, ...) 
1217{ 1218 va_list ap; 1219 va_list ap2; 1220 1221 va_start(ap, fmt); 1222 va_copy(ap2, ap); 1223 fprintf(stderr, "qemu: fatal: "); 1224 vfprintf(stderr, fmt, ap); 1225 fprintf(stderr, "\n"); 1226 cpu_dump_state(cpu, stderr, CPU_DUMP_FPU | CPU_DUMP_CCOP); 1227 if (qemu_log_separate()) { 1228 FILE *logfile = qemu_log_lock(); 1229 qemu_log("qemu: fatal: "); 1230 qemu_log_vprintf(fmt, ap2); 1231 qemu_log("\n"); 1232 log_cpu_state(cpu, CPU_DUMP_FPU | CPU_DUMP_CCOP); 1233 qemu_log_flush(); 1234 qemu_log_unlock(logfile); 1235 qemu_log_close(); 1236 } 1237 va_end(ap2); 1238 va_end(ap); 1239 replay_finish(); 1240#if defined(CONFIG_USER_ONLY) 1241 { 1242 struct sigaction act; 1243 sigfillset(&act.sa_mask); 1244 act.sa_handler = SIG_DFL; 1245 act.sa_flags = 0; 1246 sigaction(SIGABRT, &act, NULL); 1247 } 1248#endif 1249 abort(); 1250} 1251 1252#if !defined(CONFIG_USER_ONLY) 1253/* Called from RCU critical section */ 1254static RAMBlock *qemu_get_ram_block(ram_addr_t addr) 1255{ 1256 RAMBlock *block; 1257 1258 block = atomic_rcu_read(&ram_list.mru_block); 1259 if (block && addr - block->offset < block->max_length) { 1260 return block; 1261 } 1262 RAMBLOCK_FOREACH(block) { 1263 if (addr - block->offset < block->max_length) { 1264 goto found; 1265 } 1266 } 1267 1268 fprintf(stderr, "Bad ram offset %" PRIx64 "\n", (uint64_t)addr); 1269 abort(); 1270 1271found: 1272 /* It is safe to write mru_block outside the iothread lock. This 1273 * is what happens: 1274 * 1275 * mru_block = xxx 1276 * rcu_read_unlock() 1277 * xxx removed from list 1278 * rcu_read_lock() 1279 * read mru_block 1280 * mru_block = NULL; 1281 * call_rcu(reclaim_ramblock, xxx); 1282 * rcu_read_unlock() 1283 * 1284 * atomic_rcu_set is not needed here. The block was already published 1285 * when it was placed into the list. Here we're just making an extra 1286 * copy of the pointer. 1287 */ 1288 ram_list.mru_block = block; 1289 return block; 1290} 1291 1292static void tlb_reset_dirty_range_all(ram_addr_t start, ram_addr_t length) 1293{ 1294 CPUState *cpu; 1295 ram_addr_t start1; 1296 RAMBlock *block; 1297 ram_addr_t end; 1298 1299 assert(tcg_enabled()); 1300 end = TARGET_PAGE_ALIGN(start + length); 1301 start &= TARGET_PAGE_MASK; 1302 1303 RCU_READ_LOCK_GUARD(); 1304 block = qemu_get_ram_block(start); 1305 assert(block == qemu_get_ram_block(end - 1)); 1306 start1 = (uintptr_t)ramblock_ptr(block, start - block->offset); 1307 CPU_FOREACH(cpu) { 1308 tlb_reset_dirty(cpu, start1, length); 1309 } 1310} 1311 1312/* Note: start and end must be within the same ram block. 
*/ 1313bool cpu_physical_memory_test_and_clear_dirty(ram_addr_t start, 1314 ram_addr_t length, 1315 unsigned client) 1316{ 1317 DirtyMemoryBlocks *blocks; 1318 unsigned long end, page, start_page; 1319 bool dirty = false; 1320 RAMBlock *ramblock; 1321 uint64_t mr_offset, mr_size; 1322 1323 if (length == 0) { 1324 return false; 1325 } 1326 1327 end = TARGET_PAGE_ALIGN(start + length) >> TARGET_PAGE_BITS; 1328 start_page = start >> TARGET_PAGE_BITS; 1329 page = start_page; 1330 1331 WITH_RCU_READ_LOCK_GUARD() { 1332 blocks = atomic_rcu_read(&ram_list.dirty_memory[client]); 1333 ramblock = qemu_get_ram_block(start); 1334 /* Range sanity check on the ramblock */ 1335 assert(start >= ramblock->offset && 1336 start + length <= ramblock->offset + ramblock->used_length); 1337 1338 while (page < end) { 1339 unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE; 1340 unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE; 1341 unsigned long num = MIN(end - page, 1342 DIRTY_MEMORY_BLOCK_SIZE - offset); 1343 1344 dirty |= bitmap_test_and_clear_atomic(blocks->blocks[idx], 1345 offset, num); 1346 page += num; 1347 } 1348 1349 mr_offset = (ram_addr_t)(start_page << TARGET_PAGE_BITS) - ramblock->offset; 1350 mr_size = (end - start_page) << TARGET_PAGE_BITS; 1351 memory_region_clear_dirty_bitmap(ramblock->mr, mr_offset, mr_size); 1352 } 1353 1354 if (dirty && tcg_enabled()) { 1355 tlb_reset_dirty_range_all(start, length); 1356 } 1357 1358 return dirty; 1359} 1360 1361DirtyBitmapSnapshot *cpu_physical_memory_snapshot_and_clear_dirty 1362 (MemoryRegion *mr, hwaddr offset, hwaddr length, unsigned client) 1363{ 1364 DirtyMemoryBlocks *blocks; 1365 ram_addr_t start = memory_region_get_ram_addr(mr) + offset; 1366 unsigned long align = 1UL << (TARGET_PAGE_BITS + BITS_PER_LEVEL); 1367 ram_addr_t first = QEMU_ALIGN_DOWN(start, align); 1368 ram_addr_t last = QEMU_ALIGN_UP(start + length, align); 1369 DirtyBitmapSnapshot *snap; 1370 unsigned long page, end, dest; 1371 1372 snap = g_malloc0(sizeof(*snap) + 1373 ((last - first) >> (TARGET_PAGE_BITS + 3))); 1374 snap->start = first; 1375 snap->end = last; 1376 1377 page = first >> TARGET_PAGE_BITS; 1378 end = last >> TARGET_PAGE_BITS; 1379 dest = 0; 1380 1381 WITH_RCU_READ_LOCK_GUARD() { 1382 blocks = atomic_rcu_read(&ram_list.dirty_memory[client]); 1383 1384 while (page < end) { 1385 unsigned long idx = page / DIRTY_MEMORY_BLOCK_SIZE; 1386 unsigned long offset = page % DIRTY_MEMORY_BLOCK_SIZE; 1387 unsigned long num = MIN(end - page, 1388 DIRTY_MEMORY_BLOCK_SIZE - offset); 1389 1390 assert(QEMU_IS_ALIGNED(offset, (1 << BITS_PER_LEVEL))); 1391 assert(QEMU_IS_ALIGNED(num, (1 << BITS_PER_LEVEL))); 1392 offset >>= BITS_PER_LEVEL; 1393 1394 bitmap_copy_and_clear_atomic(snap->dirty + dest, 1395 blocks->blocks[idx] + offset, 1396 num); 1397 page += num; 1398 dest += num >> BITS_PER_LEVEL; 1399 } 1400 } 1401 1402 if (tcg_enabled()) { 1403 tlb_reset_dirty_range_all(start, length); 1404 } 1405 1406 memory_region_clear_dirty_bitmap(mr, offset, length); 1407 1408 return snap; 1409} 1410 1411bool cpu_physical_memory_snapshot_get_dirty(DirtyBitmapSnapshot *snap, 1412 ram_addr_t start, 1413 ram_addr_t length) 1414{ 1415 unsigned long page, end; 1416 1417 assert(start >= snap->start); 1418 assert(start + length <= snap->end); 1419 1420 end = TARGET_PAGE_ALIGN(start + length - snap->start) >> TARGET_PAGE_BITS; 1421 page = (start - snap->start) >> TARGET_PAGE_BITS; 1422 1423 while (page < end) { 1424 if (test_bit(page, snap->dirty)) { 1425 return true; 1426 } 1427 page++; 1428 } 1429 return 
false; 1430} 1431 1432/* Called from RCU critical section */ 1433hwaddr memory_region_section_get_iotlb(CPUState *cpu, 1434 MemoryRegionSection *section) 1435{ 1436 AddressSpaceDispatch *d = flatview_to_dispatch(section->fv); 1437 return section - d->map.sections; 1438} 1439#endif /* defined(CONFIG_USER_ONLY) */ 1440 1441#if !defined(CONFIG_USER_ONLY) 1442 1443static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end, 1444 uint16_t section); 1445static subpage_t *subpage_init(FlatView *fv, hwaddr base); 1446 1447static void *(*phys_mem_alloc)(size_t size, uint64_t *align, bool shared) = 1448 qemu_anon_ram_alloc; 1449 1450/* 1451 * Set a custom physical guest memory allocator. 1452 * Accelerators with unusual needs may need this. Hopefully, we can 1453 * get rid of it eventually. 1454 */ 1455void phys_mem_set_alloc(void *(*alloc)(size_t, uint64_t *align, bool shared)) 1456{ 1457 phys_mem_alloc = alloc; 1458} 1459 1460static uint16_t phys_section_add(PhysPageMap *map, 1461 MemoryRegionSection *section) 1462{ 1463 /* The physical section number is ORed with a page-aligned 1464 * pointer to produce the iotlb entries. Thus it should 1465 * never overflow into the page-aligned value. 1466 */ 1467 assert(map->sections_nb < TARGET_PAGE_SIZE); 1468 1469 if (map->sections_nb == map->sections_nb_alloc) { 1470 map->sections_nb_alloc = MAX(map->sections_nb_alloc * 2, 16); 1471 map->sections = g_renew(MemoryRegionSection, map->sections, 1472 map->sections_nb_alloc); 1473 } 1474 map->sections[map->sections_nb] = *section; 1475 memory_region_ref(section->mr); 1476 return map->sections_nb++; 1477} 1478 1479static void phys_section_destroy(MemoryRegion *mr) 1480{ 1481 bool have_sub_page = mr->subpage; 1482 1483 memory_region_unref(mr); 1484 1485 if (have_sub_page) { 1486 subpage_t *subpage = container_of(mr, subpage_t, iomem); 1487 object_unref(OBJECT(&subpage->iomem)); 1488 g_free(subpage); 1489 } 1490} 1491 1492static void phys_sections_free(PhysPageMap *map) 1493{ 1494 while (map->sections_nb > 0) { 1495 MemoryRegionSection *section = &map->sections[--map->sections_nb]; 1496 phys_section_destroy(section->mr); 1497 } 1498 g_free(map->sections); 1499 g_free(map->nodes); 1500} 1501 1502static void register_subpage(FlatView *fv, MemoryRegionSection *section) 1503{ 1504 AddressSpaceDispatch *d = flatview_to_dispatch(fv); 1505 subpage_t *subpage; 1506 hwaddr base = section->offset_within_address_space 1507 & TARGET_PAGE_MASK; 1508 MemoryRegionSection *existing = phys_page_find(d, base); 1509 MemoryRegionSection subsection = { 1510 .offset_within_address_space = base, 1511 .size = int128_make64(TARGET_PAGE_SIZE), 1512 }; 1513 hwaddr start, end; 1514 1515 assert(existing->mr->subpage || existing->mr == &io_mem_unassigned); 1516 1517 if (!(existing->mr->subpage)) { 1518 subpage = subpage_init(fv, base); 1519 subsection.fv = fv; 1520 subsection.mr = &subpage->iomem; 1521 phys_page_set(d, base >> TARGET_PAGE_BITS, 1, 1522 phys_section_add(&d->map, &subsection)); 1523 } else { 1524 subpage = container_of(existing->mr, subpage_t, iomem); 1525 } 1526 start = section->offset_within_address_space & ~TARGET_PAGE_MASK; 1527 end = start + int128_get64(section->size) - 1; 1528 subpage_register(subpage, start, end, 1529 phys_section_add(&d->map, section)); 1530} 1531 1532 1533static void register_multipage(FlatView *fv, 1534 MemoryRegionSection *section) 1535{ 1536 AddressSpaceDispatch *d = flatview_to_dispatch(fv); 1537 hwaddr start_addr = section->offset_within_address_space; 1538 uint16_t section_index =
phys_section_add(&d->map, section); 1539 uint64_t num_pages = int128_get64(int128_rshift(section->size, 1540 TARGET_PAGE_BITS)); 1541 1542 assert(num_pages); 1543 phys_page_set(d, start_addr >> TARGET_PAGE_BITS, num_pages, section_index); 1544} 1545 1546/* 1547 * The range in *section* may look like this: 1548 * 1549 * |s|PPPPPPP|s| 1550 * 1551 * where s stands for subpage and P for page. 1552 */ 1553void flatview_add_to_dispatch(FlatView *fv, MemoryRegionSection *section) 1554{ 1555 MemoryRegionSection remain = *section; 1556 Int128 page_size = int128_make64(TARGET_PAGE_SIZE); 1557 1558 /* register first subpage */ 1559 if (remain.offset_within_address_space & ~TARGET_PAGE_MASK) { 1560 uint64_t left = TARGET_PAGE_ALIGN(remain.offset_within_address_space) 1561 - remain.offset_within_address_space; 1562 1563 MemoryRegionSection now = remain; 1564 now.size = int128_min(int128_make64(left), now.size); 1565 register_subpage(fv, &now); 1566 if (int128_eq(remain.size, now.size)) { 1567 return; 1568 } 1569 remain.size = int128_sub(remain.size, now.size); 1570 remain.offset_within_address_space += int128_get64(now.size); 1571 remain.offset_within_region += int128_get64(now.size); 1572 } 1573 1574 /* register whole pages */ 1575 if (int128_ge(remain.size, page_size)) { 1576 MemoryRegionSection now = remain; 1577 now.size = int128_and(now.size, int128_neg(page_size)); 1578 register_multipage(fv, &now); 1579 if (int128_eq(remain.size, now.size)) { 1580 return; 1581 } 1582 remain.size = int128_sub(remain.size, now.size); 1583 remain.offset_within_address_space += int128_get64(now.size); 1584 remain.offset_within_region += int128_get64(now.size); 1585 } 1586 1587 /* register last subpage */ 1588 register_subpage(fv, &remain); 1589} 1590 1591void qemu_flush_coalesced_mmio_buffer(void) 1592{ 1593 if (kvm_enabled()) 1594 kvm_flush_coalesced_mmio_buffer(); 1595} 1596 1597void qemu_mutex_lock_ramlist(void) 1598{ 1599 qemu_mutex_lock(&ram_list.mutex); 1600} 1601 1602void qemu_mutex_unlock_ramlist(void) 1603{ 1604 qemu_mutex_unlock(&ram_list.mutex); 1605} 1606 1607void ram_block_dump(Monitor *mon) 1608{ 1609 RAMBlock *block; 1610 char *psize; 1611 1612 RCU_READ_LOCK_GUARD(); 1613 monitor_printf(mon, "%24s %8s %18s %18s %18s\n", 1614 "Block Name", "PSize", "Offset", "Used", "Total"); 1615 RAMBLOCK_FOREACH(block) { 1616 psize = size_to_str(block->page_size); 1617 monitor_printf(mon, "%24s %8s 0x%016" PRIx64 " 0x%016" PRIx64 1618 " 0x%016" PRIx64 "\n", block->idstr, psize, 1619 (uint64_t)block->offset, 1620 (uint64_t)block->used_length, 1621 (uint64_t)block->max_length); 1622 g_free(psize); 1623 } 1624} 1625 1626#ifdef __linux__ 1627/* 1628 * FIXME TOCTTOU: this iterates over memory backends' mem-path, which 1629 * may or may not name the same files / on the same filesystem now as 1630 * when we actually open and map them. Iterate over the file 1631 * descriptors instead, and use qemu_fd_getpagesize(). 
1632 */ 1633static int find_min_backend_pagesize(Object *obj, void *opaque) 1634{ 1635 long *hpsize_min = opaque; 1636 1637 if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) { 1638 HostMemoryBackend *backend = MEMORY_BACKEND(obj); 1639 long hpsize = host_memory_backend_pagesize(backend); 1640 1641 if (host_memory_backend_is_mapped(backend) && (hpsize < *hpsize_min)) { 1642 *hpsize_min = hpsize; 1643 } 1644 } 1645 1646 return 0; 1647} 1648 1649static int find_max_backend_pagesize(Object *obj, void *opaque) 1650{ 1651 long *hpsize_max = opaque; 1652 1653 if (object_dynamic_cast(obj, TYPE_MEMORY_BACKEND)) { 1654 HostMemoryBackend *backend = MEMORY_BACKEND(obj); 1655 long hpsize = host_memory_backend_pagesize(backend); 1656 1657 if (host_memory_backend_is_mapped(backend) && (hpsize > *hpsize_max)) { 1658 *hpsize_max = hpsize; 1659 } 1660 } 1661 1662 return 0; 1663} 1664 1665/* 1666 * TODO: We assume right now that all mapped host memory backends are 1667 * used as RAM, however some might be used for different purposes. 1668 */ 1669long qemu_minrampagesize(void) 1670{ 1671 long hpsize = LONG_MAX; 1672 Object *memdev_root = object_resolve_path("/objects", NULL); 1673 1674 object_child_foreach(memdev_root, find_min_backend_pagesize, &hpsize); 1675 return hpsize; 1676} 1677 1678long qemu_maxrampagesize(void) 1679{ 1680 long pagesize = 0; 1681 Object *memdev_root = object_resolve_path("/objects", NULL); 1682 1683 object_child_foreach(memdev_root, find_max_backend_pagesize, &pagesize); 1684 return pagesize; 1685} 1686#else 1687long qemu_minrampagesize(void) 1688{ 1689 return qemu_real_host_page_size; 1690} 1691long qemu_maxrampagesize(void) 1692{ 1693 return qemu_real_host_page_size; 1694} 1695#endif 1696 1697#ifdef CONFIG_POSIX 1698static int64_t get_file_size(int fd) 1699{ 1700 int64_t size; 1701#if defined(__linux__) 1702 struct stat st; 1703 1704 if (fstat(fd, &st) < 0) { 1705 return -errno; 1706 } 1707 1708 /* Special handling for devdax character devices */ 1709 if (S_ISCHR(st.st_mode)) { 1710 g_autofree char *subsystem_path = NULL; 1711 g_autofree char *subsystem = NULL; 1712 1713 subsystem_path = g_strdup_printf("/sys/dev/char/%d:%d/subsystem", 1714 major(st.st_rdev), minor(st.st_rdev)); 1715 subsystem = g_file_read_link(subsystem_path, NULL); 1716 1717 if (subsystem && g_str_has_suffix(subsystem, "/dax")) { 1718 g_autofree char *size_path = NULL; 1719 g_autofree char *size_str = NULL; 1720 1721 size_path = g_strdup_printf("/sys/dev/char/%d:%d/size", 1722 major(st.st_rdev), minor(st.st_rdev)); 1723 1724 if (g_file_get_contents(size_path, &size_str, NULL, NULL)) { 1725 return g_ascii_strtoll(size_str, NULL, 0); 1726 } 1727 } 1728 } 1729#endif /* defined(__linux__) */ 1730 1731 /* st.st_size may be zero for special files yet lseek(2) works */ 1732 size = lseek(fd, 0, SEEK_END); 1733 if (size < 0) { 1734 return -errno; 1735 } 1736 return size; 1737} 1738 1739static int file_ram_open(const char *path, 1740 const char *region_name, 1741 bool *created, 1742 Error **errp) 1743{ 1744 char *filename; 1745 char *sanitized_name; 1746 char *c; 1747 int fd = -1; 1748 1749 *created = false; 1750 for (;;) { 1751 fd = open(path, O_RDWR); 1752 if (fd >= 0) { 1753 /* @path names an existing file, use it */ 1754 break; 1755 } 1756 if (errno == ENOENT) { 1757 /* @path names a file that doesn't exist, create it */ 1758 fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0644); 1759 if (fd >= 0) { 1760 *created = true; 1761 break; 1762 } 1763 } else if (errno == EISDIR) { 1764 /* @path names a directory, create a file 
there */ 1765 /* Make name safe to use with mkstemp by replacing '/' with '_'. */ 1766 sanitized_name = g_strdup(region_name); 1767 for (c = sanitized_name; *c != '\0'; c++) { 1768 if (*c == '/') { 1769 *c = '_'; 1770 } 1771 } 1772 1773 filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path, 1774 sanitized_name); 1775 g_free(sanitized_name); 1776 1777 fd = mkstemp(filename); 1778 if (fd >= 0) { 1779 unlink(filename); 1780 g_free(filename); 1781 break; 1782 } 1783 g_free(filename); 1784 } 1785 if (errno != EEXIST && errno != EINTR) { 1786 error_setg_errno(errp, errno, 1787 "can't open backing store %s for guest RAM", 1788 path); 1789 return -1; 1790 } 1791 /* 1792 * Try again on EINTR and EEXIST. The latter happens when 1793 * something else creates the file between our two open(). 1794 */ 1795 } 1796 1797 return fd; 1798} 1799 1800static void *file_ram_alloc(RAMBlock *block, 1801 ram_addr_t memory, 1802 int fd, 1803 bool truncate, 1804 Error **errp) 1805{ 1806 void *area; 1807 1808 block->page_size = qemu_fd_getpagesize(fd); 1809 if (block->mr->align % block->page_size) { 1810 error_setg(errp, "alignment 0x%" PRIx64 1811 " must be multiples of page size 0x%zx", 1812 block->mr->align, block->page_size); 1813 return NULL; 1814 } else if (block->mr->align && !is_power_of_2(block->mr->align)) { 1815 error_setg(errp, "alignment 0x%" PRIx64 1816 " must be a power of two", block->mr->align); 1817 return NULL; 1818 } 1819 block->mr->align = MAX(block->page_size, block->mr->align); 1820#if defined(__s390x__) 1821 if (kvm_enabled()) { 1822 block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN); 1823 } 1824#endif 1825 1826 if (memory < block->page_size) { 1827 error_setg(errp, "memory size 0x" RAM_ADDR_FMT " must be equal to " 1828 "or larger than page size 0x%zx", 1829 memory, block->page_size); 1830 return NULL; 1831 } 1832 1833 memory = ROUND_UP(memory, block->page_size); 1834 1835 /* 1836 * ftruncate is not supported by hugetlbfs in older 1837 * hosts, so don't bother bailing out on errors. 1838 * If anything goes wrong with it under other filesystems, 1839 * mmap will fail. 1840 * 1841 * Do not truncate the non-empty backend file to avoid corrupting 1842 * the existing data in the file. Disabling shrinking is not 1843 * enough. For example, the current vNVDIMM implementation stores 1844 * the guest NVDIMM labels at the end of the backend file. If the 1845 * backend file is later extended, QEMU will not be able to find 1846 * those labels. Therefore, extending the non-empty backend file 1847 * is disabled as well. 1848 */ 1849 if (truncate && ftruncate(fd, memory)) { 1850 perror("ftruncate"); 1851 } 1852 1853 area = qemu_ram_mmap(fd, memory, block->mr->align, 1854 block->flags & RAM_SHARED, block->flags & RAM_PMEM); 1855 if (area == MAP_FAILED) { 1856 error_setg_errno(errp, errno, 1857 "unable to map backing store for guest RAM"); 1858 return NULL; 1859 } 1860 1861 block->fd = fd; 1862 return area; 1863} 1864#endif 1865 1866/* Allocate space within the ram_addr_t space that governs the 1867 * dirty bitmaps. 1868 * Called with the ramlist lock held. 
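 * find_ram_offset() below is a best-fit search: each existing block's aligned end is a candidate offset, the nearest following block bounds the gap, and the candidate with the smallest gap that still fits the requested size wins, which limits fragmentation.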
1869 */ 1870static ram_addr_t find_ram_offset(ram_addr_t size) 1871{ 1872 RAMBlock *block, *next_block; 1873 ram_addr_t offset = RAM_ADDR_MAX, mingap = RAM_ADDR_MAX; 1874 1875 assert(size != 0); /* it would hand out same offset multiple times */ 1876 1877 if (QLIST_EMPTY_RCU(&ram_list.blocks)) { 1878 return 0; 1879 } 1880 1881 RAMBLOCK_FOREACH(block) { 1882 ram_addr_t candidate, next = RAM_ADDR_MAX; 1883 1884 /* Align blocks to start on a 'long' in the bitmap 1885 * which makes the bitmap sync'ing take the fast path. 1886 */ 1887 candidate = block->offset + block->max_length; 1888 candidate = ROUND_UP(candidate, BITS_PER_LONG << TARGET_PAGE_BITS); 1889 1890 /* Search for the closest following block 1891 * and find the gap. 1892 */ 1893 RAMBLOCK_FOREACH(next_block) { 1894 if (next_block->offset >= candidate) { 1895 next = MIN(next, next_block->offset); 1896 } 1897 } 1898 1899 /* If it fits remember our place and remember the size 1900 * of gap, but keep going so that we might find a smaller 1901 * gap to fill so avoiding fragmentation. 1902 */ 1903 if (next - candidate >= size && next - candidate < mingap) { 1904 offset = candidate; 1905 mingap = next - candidate; 1906 } 1907 1908 trace_find_ram_offset_loop(size, candidate, offset, next, mingap); 1909 } 1910 1911 if (offset == RAM_ADDR_MAX) { 1912 fprintf(stderr, "Failed to find gap of requested size: %" PRIu64 "\n", 1913 (uint64_t)size); 1914 abort(); 1915 } 1916 1917 trace_find_ram_offset(size, offset); 1918 1919 return offset; 1920} 1921 1922static unsigned long last_ram_page(void) 1923{ 1924 RAMBlock *block; 1925 ram_addr_t last = 0; 1926 1927 RCU_READ_LOCK_GUARD(); 1928 RAMBLOCK_FOREACH(block) { 1929 last = MAX(last, block->offset + block->max_length); 1930 } 1931 return last >> TARGET_PAGE_BITS; 1932} 1933 1934static void qemu_ram_setup_dump(void *addr, ram_addr_t size) 1935{ 1936 int ret; 1937 1938 /* Use MADV_DONTDUMP, if user doesn't want the guest memory in the core */ 1939 if (!machine_dump_guest_core(current_machine)) { 1940 ret = qemu_madvise(addr, size, QEMU_MADV_DONTDUMP); 1941 if (ret) { 1942 perror("qemu_madvise"); 1943 fprintf(stderr, "madvise doesn't support MADV_DONTDUMP, " 1944 "but dump_guest_core=off specified\n"); 1945 } 1946 } 1947} 1948 1949const char *qemu_ram_get_idstr(RAMBlock *rb) 1950{ 1951 return rb->idstr; 1952} 1953 1954void *qemu_ram_get_host_addr(RAMBlock *rb) 1955{ 1956 return rb->host; 1957} 1958 1959ram_addr_t qemu_ram_get_offset(RAMBlock *rb) 1960{ 1961 return rb->offset; 1962} 1963 1964ram_addr_t qemu_ram_get_used_length(RAMBlock *rb) 1965{ 1966 return rb->used_length; 1967} 1968 1969bool qemu_ram_is_shared(RAMBlock *rb) 1970{ 1971 return rb->flags & RAM_SHARED; 1972} 1973 1974/* Note: Only set at the start of postcopy */ 1975bool qemu_ram_is_uf_zeroable(RAMBlock *rb) 1976{ 1977 return rb->flags & RAM_UF_ZEROPAGE; 1978} 1979 1980void qemu_ram_set_uf_zeroable(RAMBlock *rb) 1981{ 1982 rb->flags |= RAM_UF_ZEROPAGE; 1983} 1984 1985bool qemu_ram_is_migratable(RAMBlock *rb) 1986{ 1987 return rb->flags & RAM_MIGRATABLE; 1988} 1989 1990void qemu_ram_set_migratable(RAMBlock *rb) 1991{ 1992 rb->flags |= RAM_MIGRATABLE; 1993} 1994 1995void qemu_ram_unset_migratable(RAMBlock *rb) 1996{ 1997 rb->flags &= ~RAM_MIGRATABLE; 1998} 1999 2000/* Called with iothread lock held. 
2001 void qemu_ram_set_idstr(RAMBlock *new_block, const char *name, DeviceState *dev)
2002 {
2003     RAMBlock *block;
2004
2005     assert(new_block);
2006     assert(!new_block->idstr[0]);
2007
2008     if (dev) {
2009         char *id = qdev_get_dev_path(dev);
2010         if (id) {
2011             snprintf(new_block->idstr, sizeof(new_block->idstr), "%s/", id);
2012             g_free(id);
2013         }
2014     }
2015     pstrcat(new_block->idstr, sizeof(new_block->idstr), name);
2016
2017     RCU_READ_LOCK_GUARD();
2018     RAMBLOCK_FOREACH(block) {
2019         if (block != new_block &&
2020             !strcmp(block->idstr, new_block->idstr)) {
2021             fprintf(stderr, "RAMBlock \"%s\" already registered, abort!\n",
2022                     new_block->idstr);
2023             abort();
2024         }
2025     }
2026 }
2027
2028 /* Called with iothread lock held. */
2029 void qemu_ram_unset_idstr(RAMBlock *block)
2030 {
2031     /* FIXME: arch_init.c assumes that this is not called throughout
2032      * migration. Ignore the problem since hot-unplug during migration
2033      * does not work anyway.
2034      */
2035     if (block) {
2036         memset(block->idstr, 0, sizeof(block->idstr));
2037     }
2038 }
2039
2040 size_t qemu_ram_pagesize(RAMBlock *rb)
2041 {
2042     return rb->page_size;
2043 }
2044
2045 /* Returns the largest size of page in use */
2046 size_t qemu_ram_pagesize_largest(void)
2047 {
2048     RAMBlock *block;
2049     size_t largest = 0;
2050
2051     RAMBLOCK_FOREACH(block) {
2052         largest = MAX(largest, qemu_ram_pagesize(block));
2053     }
2054
2055     return largest;
2056 }
2057
2058 static int memory_try_enable_merging(void *addr, size_t len)
2059 {
2060     if (!machine_mem_merge(current_machine)) {
2061         /* disabled by the user */
2062         return 0;
2063     }
2064
2065     return qemu_madvise(addr, len, QEMU_MADV_MERGEABLE);
2066 }
2067
2068 /* Only legal before the guest might have detected the memory size: e.g. on
2069  * incoming migration, or right after reset.
2070  *
2071  * As the memory core doesn't know how the memory is accessed, it is up to
2072  * the resize callback to update device state and/or add assertions to detect
2073  * misuse, if necessary.
2074  */
2075 int qemu_ram_resize(RAMBlock *block, ram_addr_t newsize, Error **errp)
2076 {
2077     const ram_addr_t unaligned_size = newsize;
2078
2079     assert(block);
2080
2081     newsize = HOST_PAGE_ALIGN(newsize);
2082
2083     if (block->used_length == newsize) {
2084         /*
2085          * We don't have to resize the ram block (which only knows aligned
2086          * sizes); however, we have to notify if the unaligned size changed.
2087          */
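        /*
         * Editorial example (not from the original source): with 4 KiB host
         * pages, resizing to 0x10000800 bytes leaves used_length at the
         * aligned 0x10001000, while the MemoryRegion is set to report the
         * unaligned 0x10000800; only the notification below is needed.
         */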
2087 */ 2088 if (unaligned_size != memory_region_size(block->mr)) { 2089 memory_region_set_size(block->mr, unaligned_size); 2090 if (block->resized) { 2091 block->resized(block->idstr, unaligned_size, block->host); 2092 } 2093 } 2094 return 0; 2095 } 2096 2097 if (!(block->flags & RAM_RESIZEABLE)) { 2098 error_setg_errno(errp, EINVAL, 2099 "Length mismatch: %s: 0x" RAM_ADDR_FMT 2100 " in != 0x" RAM_ADDR_FMT, block->idstr, 2101 newsize, block->used_length); 2102 return -EINVAL; 2103 } 2104 2105 if (block->max_length < newsize) { 2106 error_setg_errno(errp, EINVAL, 2107 "Length too large: %s: 0x" RAM_ADDR_FMT 2108 " > 0x" RAM_ADDR_FMT, block->idstr, 2109 newsize, block->max_length); 2110 return -EINVAL; 2111 } 2112 2113 cpu_physical_memory_clear_dirty_range(block->offset, block->used_length); 2114 block->used_length = newsize; 2115 cpu_physical_memory_set_dirty_range(block->offset, block->used_length, 2116 DIRTY_CLIENTS_ALL); 2117 memory_region_set_size(block->mr, unaligned_size); 2118 if (block->resized) { 2119 block->resized(block->idstr, unaligned_size, block->host); 2120 } 2121 return 0; 2122} 2123 2124/* 2125 * Trigger sync on the given ram block for range [start, start + length] 2126 * with the backing store if one is available. 2127 * Otherwise no-op. 2128 * @Note: this is supposed to be a synchronous op. 2129 */ 2130void qemu_ram_writeback(RAMBlock *block, ram_addr_t start, ram_addr_t length) 2131{ 2132 /* The requested range should fit in within the block range */ 2133 g_assert((start + length) <= block->used_length); 2134 2135#ifdef CONFIG_LIBPMEM 2136 /* The lack of support for pmem should not block the sync */ 2137 if (ramblock_is_pmem(block)) { 2138 void *addr = ramblock_ptr(block, start); 2139 pmem_persist(addr, length); 2140 return; 2141 } 2142#endif 2143 if (block->fd >= 0) { 2144 /** 2145 * Case there is no support for PMEM or the memory has not been 2146 * specified as persistent (or is not one) - use the msync. 
2147 * Less optimal but still achieves the same goal 2148 */ 2149 void *addr = ramblock_ptr(block, start); 2150 if (qemu_msync(addr, length, block->fd)) { 2151 warn_report("%s: failed to sync memory range: start: " 2152 RAM_ADDR_FMT " length: " RAM_ADDR_FMT, 2153 __func__, start, length); 2154 } 2155 } 2156} 2157 2158/* Called with ram_list.mutex held */ 2159static void dirty_memory_extend(ram_addr_t old_ram_size, 2160 ram_addr_t new_ram_size) 2161{ 2162 ram_addr_t old_num_blocks = DIV_ROUND_UP(old_ram_size, 2163 DIRTY_MEMORY_BLOCK_SIZE); 2164 ram_addr_t new_num_blocks = DIV_ROUND_UP(new_ram_size, 2165 DIRTY_MEMORY_BLOCK_SIZE); 2166 int i; 2167 2168 /* Only need to extend if block count increased */ 2169 if (new_num_blocks <= old_num_blocks) { 2170 return; 2171 } 2172 2173 for (i = 0; i < DIRTY_MEMORY_NUM; i++) { 2174 DirtyMemoryBlocks *old_blocks; 2175 DirtyMemoryBlocks *new_blocks; 2176 int j; 2177 2178 old_blocks = atomic_rcu_read(&ram_list.dirty_memory[i]); 2179 new_blocks = g_malloc(sizeof(*new_blocks) + 2180 sizeof(new_blocks->blocks[0]) * new_num_blocks); 2181 2182 if (old_num_blocks) { 2183 memcpy(new_blocks->blocks, old_blocks->blocks, 2184 old_num_blocks * sizeof(old_blocks->blocks[0])); 2185 } 2186 2187 for (j = old_num_blocks; j < new_num_blocks; j++) { 2188 new_blocks->blocks[j] = bitmap_new(DIRTY_MEMORY_BLOCK_SIZE); 2189 } 2190 2191 atomic_rcu_set(&ram_list.dirty_memory[i], new_blocks); 2192 2193 if (old_blocks) { 2194 g_free_rcu(old_blocks, rcu); 2195 } 2196 } 2197} 2198 2199static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared) 2200{ 2201 RAMBlock *block; 2202 RAMBlock *last_block = NULL; 2203 ram_addr_t old_ram_size, new_ram_size; 2204 Error *err = NULL; 2205 2206 old_ram_size = last_ram_page(); 2207 2208 qemu_mutex_lock_ramlist(); 2209 new_block->offset = find_ram_offset(new_block->max_length); 2210 2211 if (!new_block->host) { 2212 if (xen_enabled()) { 2213 xen_ram_alloc(new_block->offset, new_block->max_length, 2214 new_block->mr, &err); 2215 if (err) { 2216 error_propagate(errp, err); 2217 qemu_mutex_unlock_ramlist(); 2218 return; 2219 } 2220 } else { 2221 new_block->host = phys_mem_alloc(new_block->max_length, 2222 &new_block->mr->align, shared); 2223 if (!new_block->host) { 2224 error_setg_errno(errp, errno, 2225 "cannot set up guest memory '%s'", 2226 memory_region_name(new_block->mr)); 2227 qemu_mutex_unlock_ramlist(); 2228 return; 2229 } 2230 memory_try_enable_merging(new_block->host, new_block->max_length); 2231 } 2232 } 2233 2234 new_ram_size = MAX(old_ram_size, 2235 (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS); 2236 if (new_ram_size > old_ram_size) { 2237 dirty_memory_extend(old_ram_size, new_ram_size); 2238 } 2239 /* Keep the list sorted from biggest to smallest block. Unlike QTAILQ, 2240 * QLIST (which has an RCU-friendly variant) does not have insertion at 2241 * tail, so save the last element in last_block. 
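 *
 * Editorial example (not from the original comment): with existing blocks
 * of max_length 4G, 1G and 128M, a new 2G block breaks out of the loop at
 * the 1G entry and is inserted before it; a new 64M block walks off the
 * end and is inserted after last_block (the 128M entry).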
2242 */ 2243 RAMBLOCK_FOREACH(block) { 2244 last_block = block; 2245 if (block->max_length < new_block->max_length) { 2246 break; 2247 } 2248 } 2249 if (block) { 2250 QLIST_INSERT_BEFORE_RCU(block, new_block, next); 2251 } else if (last_block) { 2252 QLIST_INSERT_AFTER_RCU(last_block, new_block, next); 2253 } else { /* list is empty */ 2254 QLIST_INSERT_HEAD_RCU(&ram_list.blocks, new_block, next); 2255 } 2256 ram_list.mru_block = NULL; 2257 2258 /* Write list before version */ 2259 smp_wmb(); 2260 ram_list.version++; 2261 qemu_mutex_unlock_ramlist(); 2262 2263 cpu_physical_memory_set_dirty_range(new_block->offset, 2264 new_block->used_length, 2265 DIRTY_CLIENTS_ALL); 2266 2267 if (new_block->host) { 2268 qemu_ram_setup_dump(new_block->host, new_block->max_length); 2269 qemu_madvise(new_block->host, new_block->max_length, QEMU_MADV_HUGEPAGE); 2270 /* 2271 * MADV_DONTFORK is also needed by KVM in absence of synchronous MMU 2272 * Configure it unless the machine is a qtest server, in which case 2273 * KVM is not used and it may be forked (eg for fuzzing purposes). 2274 */ 2275 if (!qtest_enabled()) { 2276 qemu_madvise(new_block->host, new_block->max_length, 2277 QEMU_MADV_DONTFORK); 2278 } 2279 ram_block_notify_add(new_block->host, new_block->max_length); 2280 } 2281} 2282 2283#ifdef CONFIG_POSIX 2284RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, 2285 uint32_t ram_flags, int fd, 2286 Error **errp) 2287{ 2288 RAMBlock *new_block; 2289 Error *local_err = NULL; 2290 int64_t file_size; 2291 2292 /* Just support these ram flags by now. */ 2293 assert((ram_flags & ~(RAM_SHARED | RAM_PMEM)) == 0); 2294 2295 if (xen_enabled()) { 2296 error_setg(errp, "-mem-path not supported with Xen"); 2297 return NULL; 2298 } 2299 2300 if (kvm_enabled() && !kvm_has_sync_mmu()) { 2301 error_setg(errp, 2302 "host lacks kvm mmu notifiers, -mem-path unsupported"); 2303 return NULL; 2304 } 2305 2306 if (phys_mem_alloc != qemu_anon_ram_alloc) { 2307 /* 2308 * file_ram_alloc() needs to allocate just like 2309 * phys_mem_alloc, but we haven't bothered to provide 2310 * a hook there. 
2311 */ 2312 error_setg(errp, 2313 "-mem-path not supported with this accelerator"); 2314 return NULL; 2315 } 2316 2317 size = HOST_PAGE_ALIGN(size); 2318 file_size = get_file_size(fd); 2319 if (file_size > 0 && file_size < size) { 2320 error_setg(errp, "backing store size 0x%" PRIx64 2321 " does not match 'size' option 0x" RAM_ADDR_FMT, 2322 file_size, size); 2323 return NULL; 2324 } 2325 2326 new_block = g_malloc0(sizeof(*new_block)); 2327 new_block->mr = mr; 2328 new_block->used_length = size; 2329 new_block->max_length = size; 2330 new_block->flags = ram_flags; 2331 new_block->host = file_ram_alloc(new_block, size, fd, !file_size, errp); 2332 if (!new_block->host) { 2333 g_free(new_block); 2334 return NULL; 2335 } 2336 2337 ram_block_add(new_block, &local_err, ram_flags & RAM_SHARED); 2338 if (local_err) { 2339 g_free(new_block); 2340 error_propagate(errp, local_err); 2341 return NULL; 2342 } 2343 return new_block; 2344 2345} 2346 2347 2348RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr, 2349 uint32_t ram_flags, const char *mem_path, 2350 Error **errp) 2351{ 2352 int fd; 2353 bool created; 2354 RAMBlock *block; 2355 2356 fd = file_ram_open(mem_path, memory_region_name(mr), &created, errp); 2357 if (fd < 0) { 2358 return NULL; 2359 } 2360 2361 block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, errp); 2362 if (!block) { 2363 if (created) { 2364 unlink(mem_path); 2365 } 2366 close(fd); 2367 return NULL; 2368 } 2369 2370 return block; 2371} 2372#endif 2373 2374static 2375RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size, 2376 void (*resized)(const char*, 2377 uint64_t length, 2378 void *host), 2379 void *host, bool resizeable, bool share, 2380 MemoryRegion *mr, Error **errp) 2381{ 2382 RAMBlock *new_block; 2383 Error *local_err = NULL; 2384 2385 size = HOST_PAGE_ALIGN(size); 2386 max_size = HOST_PAGE_ALIGN(max_size); 2387 new_block = g_malloc0(sizeof(*new_block)); 2388 new_block->mr = mr; 2389 new_block->resized = resized; 2390 new_block->used_length = size; 2391 new_block->max_length = max_size; 2392 assert(max_size >= size); 2393 new_block->fd = -1; 2394 new_block->page_size = qemu_real_host_page_size; 2395 new_block->host = host; 2396 if (host) { 2397 new_block->flags |= RAM_PREALLOC; 2398 } 2399 if (resizeable) { 2400 new_block->flags |= RAM_RESIZEABLE; 2401 } 2402 ram_block_add(new_block, &local_err, share); 2403 if (local_err) { 2404 g_free(new_block); 2405 error_propagate(errp, local_err); 2406 return NULL; 2407 } 2408 return new_block; 2409} 2410 2411RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, 2412 MemoryRegion *mr, Error **errp) 2413{ 2414 return qemu_ram_alloc_internal(size, size, NULL, host, false, 2415 false, mr, errp); 2416} 2417 2418RAMBlock *qemu_ram_alloc(ram_addr_t size, bool share, 2419 MemoryRegion *mr, Error **errp) 2420{ 2421 return qemu_ram_alloc_internal(size, size, NULL, NULL, false, 2422 share, mr, errp); 2423} 2424 2425RAMBlock *qemu_ram_alloc_resizeable(ram_addr_t size, ram_addr_t maxsz, 2426 void (*resized)(const char*, 2427 uint64_t length, 2428 void *host), 2429 MemoryRegion *mr, Error **errp) 2430{ 2431 return qemu_ram_alloc_internal(size, maxsz, resized, NULL, true, 2432 false, mr, errp); 2433} 2434 2435static void reclaim_ramblock(RAMBlock *block) 2436{ 2437 if (block->flags & RAM_PREALLOC) { 2438 ; 2439 } else if (xen_enabled()) { 2440 xen_invalidate_map_cache_entry(block->host); 2441#ifndef _WIN32 2442 } else if (block->fd >= 0) { 2443 qemu_ram_munmap(block->fd, block->host, 
block->max_length); 2444 close(block->fd); 2445#endif 2446 } else { 2447 qemu_anon_ram_free(block->host, block->max_length); 2448 } 2449 g_free(block); 2450} 2451 2452void qemu_ram_free(RAMBlock *block) 2453{ 2454 if (!block) { 2455 return; 2456 } 2457 2458 if (block->host) { 2459 ram_block_notify_remove(block->host, block->max_length); 2460 } 2461 2462 qemu_mutex_lock_ramlist(); 2463 QLIST_REMOVE_RCU(block, next); 2464 ram_list.mru_block = NULL; 2465 /* Write list before version */ 2466 smp_wmb(); 2467 ram_list.version++; 2468 call_rcu(block, reclaim_ramblock, rcu); 2469 qemu_mutex_unlock_ramlist(); 2470} 2471 2472#ifndef _WIN32 2473void qemu_ram_remap(ram_addr_t addr, ram_addr_t length) 2474{ 2475 RAMBlock *block; 2476 ram_addr_t offset; 2477 int flags; 2478 void *area, *vaddr; 2479 2480 RAMBLOCK_FOREACH(block) { 2481 offset = addr - block->offset; 2482 if (offset < block->max_length) { 2483 vaddr = ramblock_ptr(block, offset); 2484 if (block->flags & RAM_PREALLOC) { 2485 ; 2486 } else if (xen_enabled()) { 2487 abort(); 2488 } else { 2489 flags = MAP_FIXED; 2490 if (block->fd >= 0) { 2491 flags |= (block->flags & RAM_SHARED ? 2492 MAP_SHARED : MAP_PRIVATE); 2493 area = mmap(vaddr, length, PROT_READ | PROT_WRITE, 2494 flags, block->fd, offset); 2495 } else { 2496 /* 2497 * Remap needs to match alloc. Accelerators that 2498 * set phys_mem_alloc never remap. If they did, 2499 * we'd need a remap hook here. 2500 */ 2501 assert(phys_mem_alloc == qemu_anon_ram_alloc); 2502 2503 flags |= MAP_PRIVATE | MAP_ANONYMOUS; 2504 area = mmap(vaddr, length, PROT_READ | PROT_WRITE, 2505 flags, -1, 0); 2506 } 2507 if (area != vaddr) { 2508 error_report("Could not remap addr: " 2509 RAM_ADDR_FMT "@" RAM_ADDR_FMT "", 2510 length, addr); 2511 exit(1); 2512 } 2513 memory_try_enable_merging(vaddr, length); 2514 qemu_ram_setup_dump(vaddr, length); 2515 } 2516 } 2517 } 2518} 2519#endif /* !_WIN32 */ 2520 2521/* Return a host pointer to ram allocated with qemu_ram_alloc. 2522 * This should not be used for general purpose DMA. Use address_space_map 2523 * or address_space_rw instead. For local memory (e.g. video ram) that the 2524 * device owns, use memory_region_get_ram_ptr. 2525 * 2526 * Called within RCU critical section. 2527 */ 2528void *qemu_map_ram_ptr(RAMBlock *ram_block, ram_addr_t addr) 2529{ 2530 RAMBlock *block = ram_block; 2531 2532 if (block == NULL) { 2533 block = qemu_get_ram_block(addr); 2534 addr -= block->offset; 2535 } 2536 2537 if (xen_enabled() && block->host == NULL) { 2538 /* We need to check if the requested address is in the RAM 2539 * because we don't want to map the entire memory in QEMU. 2540 * In that case just map until the end of the page. 2541 */ 2542 if (block->offset == 0) { 2543 return xen_map_cache(addr, 0, 0, false); 2544 } 2545 2546 block->host = xen_map_cache(block->offset, block->max_length, 1, false); 2547 } 2548 return ramblock_ptr(block, addr); 2549} 2550 2551/* Return a host pointer to guest's ram. Similar to qemu_map_ram_ptr 2552 * but takes a size argument. 2553 * 2554 * Called within RCU critical section. 
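 *
 * Caller sketch (editorial; mirrors the RAM fast path used later in this
 * file):
 *
 *     RCU_READ_LOCK_GUARD();
 *     hwaddr l = len;
 *     uint8_t *p = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false);
 *     memcpy(p, buf, l);    // at most l bytes are valid behind p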
2555 */ 2556static void *qemu_ram_ptr_length(RAMBlock *ram_block, ram_addr_t addr, 2557 hwaddr *size, bool lock) 2558{ 2559 RAMBlock *block = ram_block; 2560 if (*size == 0) { 2561 return NULL; 2562 } 2563 2564 if (block == NULL) { 2565 block = qemu_get_ram_block(addr); 2566 addr -= block->offset; 2567 } 2568 *size = MIN(*size, block->max_length - addr); 2569 2570 if (xen_enabled() && block->host == NULL) { 2571 /* We need to check if the requested address is in the RAM 2572 * because we don't want to map the entire memory in QEMU. 2573 * In that case just map the requested area. 2574 */ 2575 if (block->offset == 0) { 2576 return xen_map_cache(addr, *size, lock, lock); 2577 } 2578 2579 block->host = xen_map_cache(block->offset, block->max_length, 1, lock); 2580 } 2581 2582 return ramblock_ptr(block, addr); 2583} 2584 2585/* Return the offset of a hostpointer within a ramblock */ 2586ram_addr_t qemu_ram_block_host_offset(RAMBlock *rb, void *host) 2587{ 2588 ram_addr_t res = (uint8_t *)host - (uint8_t *)rb->host; 2589 assert((uintptr_t)host >= (uintptr_t)rb->host); 2590 assert(res < rb->max_length); 2591 2592 return res; 2593} 2594 2595/* 2596 * Translates a host ptr back to a RAMBlock, a ram_addr and an offset 2597 * in that RAMBlock. 2598 * 2599 * ptr: Host pointer to look up 2600 * round_offset: If true round the result offset down to a page boundary 2601 * *ram_addr: set to result ram_addr 2602 * *offset: set to result offset within the RAMBlock 2603 * 2604 * Returns: RAMBlock (or NULL if not found) 2605 * 2606 * By the time this function returns, the returned pointer is not protected 2607 * by RCU anymore. If the caller is not within an RCU critical section and 2608 * does not hold the iothread lock, it must have other means of protecting the 2609 * pointer, such as a reference to the region that includes the incoming 2610 * ram_addr_t. 2611 */ 2612RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset, 2613 ram_addr_t *offset) 2614{ 2615 RAMBlock *block; 2616 uint8_t *host = ptr; 2617 2618 if (xen_enabled()) { 2619 ram_addr_t ram_addr; 2620 RCU_READ_LOCK_GUARD(); 2621 ram_addr = xen_ram_addr_from_mapcache(ptr); 2622 block = qemu_get_ram_block(ram_addr); 2623 if (block) { 2624 *offset = ram_addr - block->offset; 2625 } 2626 return block; 2627 } 2628 2629 RCU_READ_LOCK_GUARD(); 2630 block = atomic_rcu_read(&ram_list.mru_block); 2631 if (block && block->host && host - block->host < block->max_length) { 2632 goto found; 2633 } 2634 2635 RAMBLOCK_FOREACH(block) { 2636 /* This case append when the block is not mapped. */ 2637 if (block->host == NULL) { 2638 continue; 2639 } 2640 if (host - block->host < block->max_length) { 2641 goto found; 2642 } 2643 } 2644 2645 return NULL; 2646 2647found: 2648 *offset = (host - block->host); 2649 if (round_offset) { 2650 *offset &= TARGET_PAGE_MASK; 2651 } 2652 return block; 2653} 2654 2655/* 2656 * Finds the named RAMBlock 2657 * 2658 * name: The name of RAMBlock to find 2659 * 2660 * Returns: RAMBlock (or NULL if not found) 2661 */ 2662RAMBlock *qemu_ram_block_by_name(const char *name) 2663{ 2664 RAMBlock *block; 2665 2666 RAMBLOCK_FOREACH(block) { 2667 if (!strcmp(name, block->idstr)) { 2668 return block; 2669 } 2670 } 2671 2672 return NULL; 2673} 2674 2675/* Some of the softmmu routines need to translate from a host pointer 2676 (typically a TLB entry) back to a ram offset. 
*/ 2677ram_addr_t qemu_ram_addr_from_host(void *ptr) 2678{ 2679 RAMBlock *block; 2680 ram_addr_t offset; 2681 2682 block = qemu_ram_block_from_host(ptr, false, &offset); 2683 if (!block) { 2684 return RAM_ADDR_INVALID; 2685 } 2686 2687 return block->offset + offset; 2688} 2689 2690/* Generate a debug exception if a watchpoint has been hit. */ 2691void cpu_check_watchpoint(CPUState *cpu, vaddr addr, vaddr len, 2692 MemTxAttrs attrs, int flags, uintptr_t ra) 2693{ 2694 CPUClass *cc = CPU_GET_CLASS(cpu); 2695 CPUWatchpoint *wp; 2696 2697 assert(tcg_enabled()); 2698 if (cpu->watchpoint_hit) { 2699 /* 2700 * We re-entered the check after replacing the TB. 2701 * Now raise the debug interrupt so that it will 2702 * trigger after the current instruction. 2703 */ 2704 qemu_mutex_lock_iothread(); 2705 cpu_interrupt(cpu, CPU_INTERRUPT_DEBUG); 2706 qemu_mutex_unlock_iothread(); 2707 return; 2708 } 2709 2710 addr = cc->adjust_watchpoint_address(cpu, addr, len); 2711 QTAILQ_FOREACH(wp, &cpu->watchpoints, entry) { 2712 if (watchpoint_address_matches(wp, addr, len) 2713 && (wp->flags & flags)) { 2714 if (flags == BP_MEM_READ) { 2715 wp->flags |= BP_WATCHPOINT_HIT_READ; 2716 } else { 2717 wp->flags |= BP_WATCHPOINT_HIT_WRITE; 2718 } 2719 wp->hitaddr = MAX(addr, wp->vaddr); 2720 wp->hitattrs = attrs; 2721 if (!cpu->watchpoint_hit) { 2722 if (wp->flags & BP_CPU && 2723 !cc->debug_check_watchpoint(cpu, wp)) { 2724 wp->flags &= ~BP_WATCHPOINT_HIT; 2725 continue; 2726 } 2727 cpu->watchpoint_hit = wp; 2728 2729 mmap_lock(); 2730 tb_check_watchpoint(cpu, ra); 2731 if (wp->flags & BP_STOP_BEFORE_ACCESS) { 2732 cpu->exception_index = EXCP_DEBUG; 2733 mmap_unlock(); 2734 cpu_loop_exit_restore(cpu, ra); 2735 } else { 2736 /* Force execution of one insn next time. */ 2737 cpu->cflags_next_tb = 1 | curr_cflags(); 2738 mmap_unlock(); 2739 if (ra) { 2740 cpu_restore_state(cpu, ra, true); 2741 } 2742 cpu_loop_exit_noexc(cpu); 2743 } 2744 } 2745 } else { 2746 wp->flags &= ~BP_WATCHPOINT_HIT; 2747 } 2748 } 2749} 2750 2751static MemTxResult flatview_read(FlatView *fv, hwaddr addr, 2752 MemTxAttrs attrs, void *buf, hwaddr len); 2753static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, 2754 const void *buf, hwaddr len); 2755static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len, 2756 bool is_write, MemTxAttrs attrs); 2757 2758static MemTxResult subpage_read(void *opaque, hwaddr addr, uint64_t *data, 2759 unsigned len, MemTxAttrs attrs) 2760{ 2761 subpage_t *subpage = opaque; 2762 uint8_t buf[8]; 2763 MemTxResult res; 2764 2765#if defined(DEBUG_SUBPAGE) 2766 printf("%s: subpage %p len %u addr " TARGET_FMT_plx "\n", __func__, 2767 subpage, len, addr); 2768#endif 2769 res = flatview_read(subpage->fv, addr + subpage->base, attrs, buf, len); 2770 if (res) { 2771 return res; 2772 } 2773 *data = ldn_p(buf, len); 2774 return MEMTX_OK; 2775} 2776 2777static MemTxResult subpage_write(void *opaque, hwaddr addr, 2778 uint64_t value, unsigned len, MemTxAttrs attrs) 2779{ 2780 subpage_t *subpage = opaque; 2781 uint8_t buf[8]; 2782 2783#if defined(DEBUG_SUBPAGE) 2784 printf("%s: subpage %p len %u addr " TARGET_FMT_plx 2785 " value %"PRIx64"\n", 2786 __func__, subpage, len, addr, value); 2787#endif 2788 stn_p(buf, len, value); 2789 return flatview_write(subpage->fv, addr + subpage->base, attrs, buf, len); 2790} 2791 2792static bool subpage_accepts(void *opaque, hwaddr addr, 2793 unsigned len, bool is_write, 2794 MemTxAttrs attrs) 2795{ 2796 subpage_t *subpage = opaque; 2797#if defined(DEBUG_SUBPAGE) 
2798 printf("%s: subpage %p %c len %u addr " TARGET_FMT_plx "\n", 2799 __func__, subpage, is_write ? 'w' : 'r', len, addr); 2800#endif 2801 2802 return flatview_access_valid(subpage->fv, addr + subpage->base, 2803 len, is_write, attrs); 2804} 2805 2806static const MemoryRegionOps subpage_ops = { 2807 .read_with_attrs = subpage_read, 2808 .write_with_attrs = subpage_write, 2809 .impl.min_access_size = 1, 2810 .impl.max_access_size = 8, 2811 .valid.min_access_size = 1, 2812 .valid.max_access_size = 8, 2813 .valid.accepts = subpage_accepts, 2814 .endianness = DEVICE_NATIVE_ENDIAN, 2815}; 2816 2817static int subpage_register(subpage_t *mmio, uint32_t start, uint32_t end, 2818 uint16_t section) 2819{ 2820 int idx, eidx; 2821 2822 if (start >= TARGET_PAGE_SIZE || end >= TARGET_PAGE_SIZE) 2823 return -1; 2824 idx = SUBPAGE_IDX(start); 2825 eidx = SUBPAGE_IDX(end); 2826#if defined(DEBUG_SUBPAGE) 2827 printf("%s: %p start %08x end %08x idx %08x eidx %08x section %d\n", 2828 __func__, mmio, start, end, idx, eidx, section); 2829#endif 2830 for (; idx <= eidx; idx++) { 2831 mmio->sub_section[idx] = section; 2832 } 2833 2834 return 0; 2835} 2836 2837static subpage_t *subpage_init(FlatView *fv, hwaddr base) 2838{ 2839 subpage_t *mmio; 2840 2841 /* mmio->sub_section is set to PHYS_SECTION_UNASSIGNED with g_malloc0 */ 2842 mmio = g_malloc0(sizeof(subpage_t) + TARGET_PAGE_SIZE * sizeof(uint16_t)); 2843 mmio->fv = fv; 2844 mmio->base = base; 2845 memory_region_init_io(&mmio->iomem, NULL, &subpage_ops, mmio, 2846 NULL, TARGET_PAGE_SIZE); 2847 mmio->iomem.subpage = true; 2848#if defined(DEBUG_SUBPAGE) 2849 printf("%s: %p base " TARGET_FMT_plx " len %08x\n", __func__, 2850 mmio, base, TARGET_PAGE_SIZE); 2851#endif 2852 2853 return mmio; 2854} 2855 2856static uint16_t dummy_section(PhysPageMap *map, FlatView *fv, MemoryRegion *mr) 2857{ 2858 assert(fv); 2859 MemoryRegionSection section = { 2860 .fv = fv, 2861 .mr = mr, 2862 .offset_within_address_space = 0, 2863 .offset_within_region = 0, 2864 .size = int128_2_64(), 2865 }; 2866 2867 return phys_section_add(map, &section); 2868} 2869 2870MemoryRegionSection *iotlb_to_section(CPUState *cpu, 2871 hwaddr index, MemTxAttrs attrs) 2872{ 2873 int asidx = cpu_asidx_from_attrs(cpu, attrs); 2874 CPUAddressSpace *cpuas = &cpu->cpu_ases[asidx]; 2875 AddressSpaceDispatch *d = atomic_rcu_read(&cpuas->memory_dispatch); 2876 MemoryRegionSection *sections = d->map.sections; 2877 2878 return &sections[index & ~TARGET_PAGE_MASK]; 2879} 2880 2881static void io_mem_init(void) 2882{ 2883 memory_region_init_io(&io_mem_unassigned, NULL, &unassigned_mem_ops, NULL, 2884 NULL, UINT64_MAX); 2885} 2886 2887AddressSpaceDispatch *address_space_dispatch_new(FlatView *fv) 2888{ 2889 AddressSpaceDispatch *d = g_new0(AddressSpaceDispatch, 1); 2890 uint16_t n; 2891 2892 n = dummy_section(&d->map, fv, &io_mem_unassigned); 2893 assert(n == PHYS_SECTION_UNASSIGNED); 2894 2895 d->phys_map = (PhysPageEntry) { .ptr = PHYS_MAP_NODE_NIL, .skip = 1 }; 2896 2897 return d; 2898} 2899 2900void address_space_dispatch_free(AddressSpaceDispatch *d) 2901{ 2902 phys_sections_free(&d->map); 2903 g_free(d); 2904} 2905 2906static void do_nothing(CPUState *cpu, run_on_cpu_data d) 2907{ 2908} 2909 2910static void tcg_log_global_after_sync(MemoryListener *listener) 2911{ 2912 CPUAddressSpace *cpuas; 2913 2914 /* Wait for the CPU to end the current TB. 
This avoids the following
2915  * incorrect race:
2916  *
2917  *      vCPU                         migration
2918  *      ----------------------       -------------------------
2919  *      TLB check -> slow path
2920  *        notdirty_mem_write
2921  *          write to RAM
2922  *          mark dirty
2923  *                                   clear dirty flag
2924  *      TLB check -> fast path
2925  *                                   read memory
2926  *        write to RAM
2927  *
2928  * by pushing the migration thread's memory read after the vCPU thread has
2929  * written the memory.
2930  */
2931     if (replay_mode == REPLAY_MODE_NONE) {
2932         /*
2933          * VGA can make calls to this function while updating the screen.
2934          * In record/replay mode this causes a deadlock, because
2935          * run_on_cpu waits for the rr mutex. Therefore no races are possible
2936          * in this case, and run_on_cpu is only needed when record/replay
2937          * is not enabled.
2938          */
2939         cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2940         run_on_cpu(cpuas->cpu, do_nothing, RUN_ON_CPU_NULL);
2941     }
2942 }
2943
2944 static void tcg_commit(MemoryListener *listener)
2945 {
2946     CPUAddressSpace *cpuas;
2947     AddressSpaceDispatch *d;
2948
2949     assert(tcg_enabled());
2950     /* since each CPU stores ram addresses in its TLB cache, we must
2951        reset the modified entries */
2952     cpuas = container_of(listener, CPUAddressSpace, tcg_as_listener);
2953     cpu_reloading_memory_map();
2954     /* The CPU and TLB are protected by the iothread lock.
2955      * We reload the dispatch pointer now because cpu_reloading_memory_map()
2956      * may have split the RCU critical section.
2957      */
2958     d = address_space_to_dispatch(cpuas->as);
2959     atomic_rcu_set(&cpuas->memory_dispatch, d);
2960     tlb_flush(cpuas->cpu);
2961 }
2962
2963 static void memory_map_init(void)
2964 {
2965     system_memory = g_malloc(sizeof(*system_memory));
2966
2967     memory_region_init(system_memory, NULL, "system", UINT64_MAX);
2968     address_space_init(&address_space_memory, system_memory, "memory");
2969
2970     system_io = g_malloc(sizeof(*system_io));
2971     memory_region_init_io(system_io, NULL, &unassigned_io_ops, NULL, "io",
2972                           65536);
2973     address_space_init(&address_space_io, system_io, "I/O");
2974 }
2975
2976 MemoryRegion *get_system_memory(void)
2977 {
2978     return system_memory;
2979 }
2980
2981 MemoryRegion *get_system_io(void)
2982 {
2983     return system_io;
2984 }
2985
2986 #endif /* !defined(CONFIG_USER_ONLY) */
2987
2988 /* physical memory access (slow version, mainly for debug) */
2989 #if defined(CONFIG_USER_ONLY)
2990 int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
2991                         void *ptr, target_ulong len, bool is_write)
2992 {
2993     int flags;
2994     target_ulong l, page;
2995     void *p;
2996     uint8_t *buf = ptr;
2997
2998     while (len > 0) {
2999         page = addr & TARGET_PAGE_MASK;
3000         l = (page + TARGET_PAGE_SIZE) - addr;
3001         if (l > len)
3002             l = len;
3003         flags = page_get_flags(page);
3004         if (!(flags & PAGE_VALID))
3005             return -1;
3006         if (is_write) {
3007             if (!(flags & PAGE_WRITE))
3008                 return -1;
3009             /* XXX: this code should not depend on lock_user */
3010             if (!(p = lock_user(VERIFY_WRITE, addr, l, 0)))
3011                 return -1;
3012             memcpy(p, buf, l);
3013             unlock_user(p, addr, l);
3014         } else {
3015             if (!(flags & PAGE_READ))
3016                 return -1;
3017             /* XXX: this code should not depend on lock_user */
3018             if (!(p = lock_user(VERIFY_READ, addr, l, 1)))
3019                 return -1;
3020             memcpy(buf, p, l);
3021             unlock_user(p, addr, 0);
3022         }
3023         len -= l;
3024         buf += l;
3025         addr += l;
3026     }
3027     return 0;
3028 }
3029
3030 #else
3031
3032 static void invalidate_and_set_dirty(MemoryRegion *mr, hwaddr addr,
3033                                      hwaddr length)
3034 {
3035     uint8_t dirty_log_mask =
memory_region_get_dirty_log_mask(mr); 3036 addr += memory_region_get_ram_addr(mr); 3037 3038 /* No early return if dirty_log_mask is or becomes 0, because 3039 * cpu_physical_memory_set_dirty_range will still call 3040 * xen_modified_memory. 3041 */ 3042 if (dirty_log_mask) { 3043 dirty_log_mask = 3044 cpu_physical_memory_range_includes_clean(addr, length, dirty_log_mask); 3045 } 3046 if (dirty_log_mask & (1 << DIRTY_MEMORY_CODE)) { 3047 assert(tcg_enabled()); 3048 tb_invalidate_phys_range(addr, addr + length); 3049 dirty_log_mask &= ~(1 << DIRTY_MEMORY_CODE); 3050 } 3051 cpu_physical_memory_set_dirty_range(addr, length, dirty_log_mask); 3052} 3053 3054void memory_region_flush_rom_device(MemoryRegion *mr, hwaddr addr, hwaddr size) 3055{ 3056 /* 3057 * In principle this function would work on other memory region types too, 3058 * but the ROM device use case is the only one where this operation is 3059 * necessary. Other memory regions should use the 3060 * address_space_read/write() APIs. 3061 */ 3062 assert(memory_region_is_romd(mr)); 3063 3064 invalidate_and_set_dirty(mr, addr, size); 3065} 3066 3067static int memory_access_size(MemoryRegion *mr, unsigned l, hwaddr addr) 3068{ 3069 unsigned access_size_max = mr->ops->valid.max_access_size; 3070 3071 /* Regions are assumed to support 1-4 byte accesses unless 3072 otherwise specified. */ 3073 if (access_size_max == 0) { 3074 access_size_max = 4; 3075 } 3076 3077 /* Bound the maximum access by the alignment of the address. */ 3078 if (!mr->ops->impl.unaligned) { 3079 unsigned align_size_max = addr & -addr; 3080 if (align_size_max != 0 && align_size_max < access_size_max) { 3081 access_size_max = align_size_max; 3082 } 3083 } 3084 3085 /* Don't attempt accesses larger than the maximum. */ 3086 if (l > access_size_max) { 3087 l = access_size_max; 3088 } 3089 l = pow2floor(l); 3090 3091 return l; 3092} 3093 3094static bool prepare_mmio_access(MemoryRegion *mr) 3095{ 3096 bool unlocked = !qemu_mutex_iothread_locked(); 3097 bool release_lock = false; 3098 3099 if (unlocked && mr->global_locking) { 3100 qemu_mutex_lock_iothread(); 3101 unlocked = false; 3102 release_lock = true; 3103 } 3104 if (mr->flush_coalesced_mmio) { 3105 if (unlocked) { 3106 qemu_mutex_lock_iothread(); 3107 } 3108 qemu_flush_coalesced_mmio_buffer(); 3109 if (unlocked) { 3110 qemu_mutex_unlock_iothread(); 3111 } 3112 } 3113 3114 return release_lock; 3115} 3116 3117/* Called within RCU critical section. 
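 *
 * Editorial example of the access splitting performed below: a 6-byte
 * write at an MMIO offset ending in ...2, on a region whose
 * valid.max_access_size is 4, is cut by memory_access_size() into a
 * 2-byte access (bounded by alignment, addr & -addr == 2) followed by
 * a 4-byte access.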
*/ 3118static MemTxResult flatview_write_continue(FlatView *fv, hwaddr addr, 3119 MemTxAttrs attrs, 3120 const void *ptr, 3121 hwaddr len, hwaddr addr1, 3122 hwaddr l, MemoryRegion *mr) 3123{ 3124 uint8_t *ram_ptr; 3125 uint64_t val; 3126 MemTxResult result = MEMTX_OK; 3127 bool release_lock = false; 3128 const uint8_t *buf = ptr; 3129 3130 for (;;) { 3131 if (!memory_access_is_direct(mr, true)) { 3132 release_lock |= prepare_mmio_access(mr); 3133 l = memory_access_size(mr, l, addr1); 3134 /* XXX: could force current_cpu to NULL to avoid 3135 potential bugs */ 3136 val = ldn_he_p(buf, l); 3137 result |= memory_region_dispatch_write(mr, addr1, val, 3138 size_memop(l), attrs); 3139 } else { 3140 /* RAM case */ 3141 ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false); 3142 memcpy(ram_ptr, buf, l); 3143 invalidate_and_set_dirty(mr, addr1, l); 3144 } 3145 3146 if (release_lock) { 3147 qemu_mutex_unlock_iothread(); 3148 release_lock = false; 3149 } 3150 3151 len -= l; 3152 buf += l; 3153 addr += l; 3154 3155 if (!len) { 3156 break; 3157 } 3158 3159 l = len; 3160 mr = flatview_translate(fv, addr, &addr1, &l, true, attrs); 3161 } 3162 3163 return result; 3164} 3165 3166/* Called from RCU critical section. */ 3167static MemTxResult flatview_write(FlatView *fv, hwaddr addr, MemTxAttrs attrs, 3168 const void *buf, hwaddr len) 3169{ 3170 hwaddr l; 3171 hwaddr addr1; 3172 MemoryRegion *mr; 3173 MemTxResult result = MEMTX_OK; 3174 3175 l = len; 3176 mr = flatview_translate(fv, addr, &addr1, &l, true, attrs); 3177 result = flatview_write_continue(fv, addr, attrs, buf, len, 3178 addr1, l, mr); 3179 3180 return result; 3181} 3182 3183/* Called within RCU critical section. */ 3184MemTxResult flatview_read_continue(FlatView *fv, hwaddr addr, 3185 MemTxAttrs attrs, void *ptr, 3186 hwaddr len, hwaddr addr1, hwaddr l, 3187 MemoryRegion *mr) 3188{ 3189 uint8_t *ram_ptr; 3190 uint64_t val; 3191 MemTxResult result = MEMTX_OK; 3192 bool release_lock = false; 3193 uint8_t *buf = ptr; 3194 3195 for (;;) { 3196 if (!memory_access_is_direct(mr, false)) { 3197 /* I/O case */ 3198 release_lock |= prepare_mmio_access(mr); 3199 l = memory_access_size(mr, l, addr1); 3200 result |= memory_region_dispatch_read(mr, addr1, &val, 3201 size_memop(l), attrs); 3202 stn_he_p(buf, l, val); 3203 } else { 3204 /* RAM case */ 3205 ram_ptr = qemu_ram_ptr_length(mr->ram_block, addr1, &l, false); 3206 memcpy(buf, ram_ptr, l); 3207 } 3208 3209 if (release_lock) { 3210 qemu_mutex_unlock_iothread(); 3211 release_lock = false; 3212 } 3213 3214 len -= l; 3215 buf += l; 3216 addr += l; 3217 3218 if (!len) { 3219 break; 3220 } 3221 3222 l = len; 3223 mr = flatview_translate(fv, addr, &addr1, &l, false, attrs); 3224 } 3225 3226 return result; 3227} 3228 3229/* Called from RCU critical section. 
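 *
 * Device models normally reach this path through the public wrappers,
 * e.g. (editorial sketch; gpa is a hypothetical guest physical address):
 *
 *     uint32_t v;
 *     address_space_read(&address_space_memory, gpa,
 *                        MEMTXATTRS_UNSPECIFIED, &v, sizeof(v));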
*/ 3230static MemTxResult flatview_read(FlatView *fv, hwaddr addr, 3231 MemTxAttrs attrs, void *buf, hwaddr len) 3232{ 3233 hwaddr l; 3234 hwaddr addr1; 3235 MemoryRegion *mr; 3236 3237 l = len; 3238 mr = flatview_translate(fv, addr, &addr1, &l, false, attrs); 3239 return flatview_read_continue(fv, addr, attrs, buf, len, 3240 addr1, l, mr); 3241} 3242 3243MemTxResult address_space_read_full(AddressSpace *as, hwaddr addr, 3244 MemTxAttrs attrs, void *buf, hwaddr len) 3245{ 3246 MemTxResult result = MEMTX_OK; 3247 FlatView *fv; 3248 3249 if (len > 0) { 3250 RCU_READ_LOCK_GUARD(); 3251 fv = address_space_to_flatview(as); 3252 result = flatview_read(fv, addr, attrs, buf, len); 3253 } 3254 3255 return result; 3256} 3257 3258MemTxResult address_space_write(AddressSpace *as, hwaddr addr, 3259 MemTxAttrs attrs, 3260 const void *buf, hwaddr len) 3261{ 3262 MemTxResult result = MEMTX_OK; 3263 FlatView *fv; 3264 3265 if (len > 0) { 3266 RCU_READ_LOCK_GUARD(); 3267 fv = address_space_to_flatview(as); 3268 result = flatview_write(fv, addr, attrs, buf, len); 3269 } 3270 3271 return result; 3272} 3273 3274MemTxResult address_space_rw(AddressSpace *as, hwaddr addr, MemTxAttrs attrs, 3275 void *buf, hwaddr len, bool is_write) 3276{ 3277 if (is_write) { 3278 return address_space_write(as, addr, attrs, buf, len); 3279 } else { 3280 return address_space_read_full(as, addr, attrs, buf, len); 3281 } 3282} 3283 3284void cpu_physical_memory_rw(hwaddr addr, void *buf, 3285 hwaddr len, bool is_write) 3286{ 3287 address_space_rw(&address_space_memory, addr, MEMTXATTRS_UNSPECIFIED, 3288 buf, len, is_write); 3289} 3290 3291enum write_rom_type { 3292 WRITE_DATA, 3293 FLUSH_CACHE, 3294}; 3295 3296static inline MemTxResult address_space_write_rom_internal(AddressSpace *as, 3297 hwaddr addr, 3298 MemTxAttrs attrs, 3299 const void *ptr, 3300 hwaddr len, 3301 enum write_rom_type type) 3302{ 3303 hwaddr l; 3304 uint8_t *ram_ptr; 3305 hwaddr addr1; 3306 MemoryRegion *mr; 3307 const uint8_t *buf = ptr; 3308 3309 RCU_READ_LOCK_GUARD(); 3310 while (len > 0) { 3311 l = len; 3312 mr = address_space_translate(as, addr, &addr1, &l, true, attrs); 3313 3314 if (!(memory_region_is_ram(mr) || 3315 memory_region_is_romd(mr))) { 3316 l = memory_access_size(mr, l, addr1); 3317 } else { 3318 /* ROM/RAM case */ 3319 ram_ptr = qemu_map_ram_ptr(mr->ram_block, addr1); 3320 switch (type) { 3321 case WRITE_DATA: 3322 memcpy(ram_ptr, buf, l); 3323 invalidate_and_set_dirty(mr, addr1, l); 3324 break; 3325 case FLUSH_CACHE: 3326 flush_icache_range((uintptr_t)ram_ptr, (uintptr_t)ram_ptr + l); 3327 break; 3328 } 3329 } 3330 len -= l; 3331 buf += l; 3332 addr += l; 3333 } 3334 return MEMTX_OK; 3335} 3336 3337/* used for ROM loading : can write in RAM and ROM */ 3338MemTxResult address_space_write_rom(AddressSpace *as, hwaddr addr, 3339 MemTxAttrs attrs, 3340 const void *buf, hwaddr len) 3341{ 3342 return address_space_write_rom_internal(as, addr, attrs, 3343 buf, len, WRITE_DATA); 3344} 3345 3346void cpu_flush_icache_range(hwaddr start, hwaddr len) 3347{ 3348 /* 3349 * This function should do the same thing as an icache flush that was 3350 * triggered from within the guest. For TCG we are always cache coherent, 3351 * so there is no need to flush anything. For KVM / Xen we need to flush 3352 * the host's instruction cache at least. 
3353 */ 3354 if (tcg_enabled()) { 3355 return; 3356 } 3357 3358 address_space_write_rom_internal(&address_space_memory, 3359 start, MEMTXATTRS_UNSPECIFIED, 3360 NULL, len, FLUSH_CACHE); 3361} 3362 3363typedef struct { 3364 MemoryRegion *mr; 3365 void *buffer; 3366 hwaddr addr; 3367 hwaddr len; 3368 bool in_use; 3369} BounceBuffer; 3370 3371static BounceBuffer bounce; 3372 3373typedef struct MapClient { 3374 QEMUBH *bh; 3375 QLIST_ENTRY(MapClient) link; 3376} MapClient; 3377 3378QemuMutex map_client_list_lock; 3379static QLIST_HEAD(, MapClient) map_client_list 3380 = QLIST_HEAD_INITIALIZER(map_client_list); 3381 3382static void cpu_unregister_map_client_do(MapClient *client) 3383{ 3384 QLIST_REMOVE(client, link); 3385 g_free(client); 3386} 3387 3388static void cpu_notify_map_clients_locked(void) 3389{ 3390 MapClient *client; 3391 3392 while (!QLIST_EMPTY(&map_client_list)) { 3393 client = QLIST_FIRST(&map_client_list); 3394 qemu_bh_schedule(client->bh); 3395 cpu_unregister_map_client_do(client); 3396 } 3397} 3398 3399void cpu_register_map_client(QEMUBH *bh) 3400{ 3401 MapClient *client = g_malloc(sizeof(*client)); 3402 3403 qemu_mutex_lock(&map_client_list_lock); 3404 client->bh = bh; 3405 QLIST_INSERT_HEAD(&map_client_list, client, link); 3406 if (!atomic_read(&bounce.in_use)) { 3407 cpu_notify_map_clients_locked(); 3408 } 3409 qemu_mutex_unlock(&map_client_list_lock); 3410} 3411 3412void cpu_exec_init_all(void) 3413{ 3414 qemu_mutex_init(&ram_list.mutex); 3415 /* The data structures we set up here depend on knowing the page size, 3416 * so no more changes can be made after this point. 3417 * In an ideal world, nothing we did before we had finished the 3418 * machine setup would care about the target page size, and we could 3419 * do this much later, rather than requiring board models to state 3420 * up front what their requirements are. 
3421 */ 3422 finalize_target_page_bits(); 3423 io_mem_init(); 3424 memory_map_init(); 3425 qemu_mutex_init(&map_client_list_lock); 3426} 3427 3428void cpu_unregister_map_client(QEMUBH *bh) 3429{ 3430 MapClient *client; 3431 3432 qemu_mutex_lock(&map_client_list_lock); 3433 QLIST_FOREACH(client, &map_client_list, link) { 3434 if (client->bh == bh) { 3435 cpu_unregister_map_client_do(client); 3436 break; 3437 } 3438 } 3439 qemu_mutex_unlock(&map_client_list_lock); 3440} 3441 3442static void cpu_notify_map_clients(void) 3443{ 3444 qemu_mutex_lock(&map_client_list_lock); 3445 cpu_notify_map_clients_locked(); 3446 qemu_mutex_unlock(&map_client_list_lock); 3447} 3448 3449static bool flatview_access_valid(FlatView *fv, hwaddr addr, hwaddr len, 3450 bool is_write, MemTxAttrs attrs) 3451{ 3452 MemoryRegion *mr; 3453 hwaddr l, xlat; 3454 3455 while (len > 0) { 3456 l = len; 3457 mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); 3458 if (!memory_access_is_direct(mr, is_write)) { 3459 l = memory_access_size(mr, l, addr); 3460 if (!memory_region_access_valid(mr, xlat, l, is_write, attrs)) { 3461 return false; 3462 } 3463 } 3464 3465 len -= l; 3466 addr += l; 3467 } 3468 return true; 3469} 3470 3471bool address_space_access_valid(AddressSpace *as, hwaddr addr, 3472 hwaddr len, bool is_write, 3473 MemTxAttrs attrs) 3474{ 3475 FlatView *fv; 3476 bool result; 3477 3478 RCU_READ_LOCK_GUARD(); 3479 fv = address_space_to_flatview(as); 3480 result = flatview_access_valid(fv, addr, len, is_write, attrs); 3481 return result; 3482} 3483 3484static hwaddr 3485flatview_extend_translation(FlatView *fv, hwaddr addr, 3486 hwaddr target_len, 3487 MemoryRegion *mr, hwaddr base, hwaddr len, 3488 bool is_write, MemTxAttrs attrs) 3489{ 3490 hwaddr done = 0; 3491 hwaddr xlat; 3492 MemoryRegion *this_mr; 3493 3494 for (;;) { 3495 target_len -= len; 3496 addr += len; 3497 done += len; 3498 if (target_len == 0) { 3499 return done; 3500 } 3501 3502 len = target_len; 3503 this_mr = flatview_translate(fv, addr, &xlat, 3504 &len, is_write, attrs); 3505 if (this_mr != mr || xlat != base + done) { 3506 return done; 3507 } 3508 } 3509} 3510 3511/* Map a physical memory region into a host virtual address. 3512 * May map a subset of the requested range, given by and returned in *plen. 3513 * May return NULL if resources needed to perform the mapping are exhausted. 3514 * Use only for reads OR writes - not for read-modify-write operations. 3515 * Use cpu_register_map_client() to know when retrying the map operation is 3516 * likely to succeed. 
3517 */ 3518void *address_space_map(AddressSpace *as, 3519 hwaddr addr, 3520 hwaddr *plen, 3521 bool is_write, 3522 MemTxAttrs attrs) 3523{ 3524 hwaddr len = *plen; 3525 hwaddr l, xlat; 3526 MemoryRegion *mr; 3527 void *ptr; 3528 FlatView *fv; 3529 3530 if (len == 0) { 3531 return NULL; 3532 } 3533 3534 l = len; 3535 RCU_READ_LOCK_GUARD(); 3536 fv = address_space_to_flatview(as); 3537 mr = flatview_translate(fv, addr, &xlat, &l, is_write, attrs); 3538 3539 if (!memory_access_is_direct(mr, is_write)) { 3540 if (atomic_xchg(&bounce.in_use, true)) { 3541 return NULL; 3542 } 3543 /* Avoid unbounded allocations */ 3544 l = MIN(l, TARGET_PAGE_SIZE); 3545 bounce.buffer = qemu_memalign(TARGET_PAGE_SIZE, l); 3546 bounce.addr = addr; 3547 bounce.len = l; 3548 3549 memory_region_ref(mr); 3550 bounce.mr = mr; 3551 if (!is_write) { 3552 flatview_read(fv, addr, MEMTXATTRS_UNSPECIFIED, 3553 bounce.buffer, l); 3554 } 3555 3556 *plen = l; 3557 return bounce.buffer; 3558 } 3559 3560 3561 memory_region_ref(mr); 3562 *plen = flatview_extend_translation(fv, addr, len, mr, xlat, 3563 l, is_write, attrs); 3564 ptr = qemu_ram_ptr_length(mr->ram_block, xlat, plen, true); 3565 3566 return ptr; 3567} 3568 3569/* Unmaps a memory region previously mapped by address_space_map(). 3570 * Will also mark the memory as dirty if is_write is true. access_len gives 3571 * the amount of memory that was actually read or written by the caller. 3572 */ 3573void address_space_unmap(AddressSpace *as, void *buffer, hwaddr len, 3574 bool is_write, hwaddr access_len) 3575{ 3576 if (buffer != bounce.buffer) { 3577 MemoryRegion *mr; 3578 ram_addr_t addr1; 3579 3580 mr = memory_region_from_host(buffer, &addr1); 3581 assert(mr != NULL); 3582 if (is_write) { 3583 invalidate_and_set_dirty(mr, addr1, access_len); 3584 } 3585 if (xen_enabled()) { 3586 xen_invalidate_map_cache_entry(buffer); 3587 } 3588 memory_region_unref(mr); 3589 return; 3590 } 3591 if (is_write) { 3592 address_space_write(as, bounce.addr, MEMTXATTRS_UNSPECIFIED, 3593 bounce.buffer, access_len); 3594 } 3595 qemu_vfree(bounce.buffer); 3596 bounce.buffer = NULL; 3597 memory_region_unref(bounce.mr); 3598 atomic_mb_set(&bounce.in_use, false); 3599 cpu_notify_map_clients(); 3600} 3601 3602void *cpu_physical_memory_map(hwaddr addr, 3603 hwaddr *plen, 3604 bool is_write) 3605{ 3606 return address_space_map(&address_space_memory, addr, plen, is_write, 3607 MEMTXATTRS_UNSPECIFIED); 3608} 3609 3610void cpu_physical_memory_unmap(void *buffer, hwaddr len, 3611 bool is_write, hwaddr access_len) 3612{ 3613 return address_space_unmap(&address_space_memory, buffer, len, is_write, access_len); 3614} 3615 3616#define ARG1_DECL AddressSpace *as 3617#define ARG1 as 3618#define SUFFIX 3619#define TRANSLATE(...) address_space_translate(as, __VA_ARGS__) 3620#define RCU_READ_LOCK(...) rcu_read_lock() 3621#define RCU_READ_UNLOCK(...) 
rcu_read_unlock() 3622#include "memory_ldst.inc.c" 3623 3624int64_t address_space_cache_init(MemoryRegionCache *cache, 3625 AddressSpace *as, 3626 hwaddr addr, 3627 hwaddr len, 3628 bool is_write) 3629{ 3630 AddressSpaceDispatch *d; 3631 hwaddr l; 3632 MemoryRegion *mr; 3633 3634 assert(len > 0); 3635 3636 l = len; 3637 cache->fv = address_space_get_flatview(as); 3638 d = flatview_to_dispatch(cache->fv); 3639 cache->mrs = *address_space_translate_internal(d, addr, &cache->xlat, &l, true); 3640 3641 mr = cache->mrs.mr; 3642 memory_region_ref(mr); 3643 if (memory_access_is_direct(mr, is_write)) { 3644 /* We don't care about the memory attributes here as we're only 3645 * doing this if we found actual RAM, which behaves the same 3646 * regardless of attributes; so UNSPECIFIED is fine. 3647 */ 3648 l = flatview_extend_translation(cache->fv, addr, len, mr, 3649 cache->xlat, l, is_write, 3650 MEMTXATTRS_UNSPECIFIED); 3651 cache->ptr = qemu_ram_ptr_length(mr->ram_block, cache->xlat, &l, true); 3652 } else { 3653 cache->ptr = NULL; 3654 } 3655 3656 cache->len = l; 3657 cache->is_write = is_write; 3658 return l; 3659} 3660 3661void address_space_cache_invalidate(MemoryRegionCache *cache, 3662 hwaddr addr, 3663 hwaddr access_len) 3664{ 3665 assert(cache->is_write); 3666 if (likely(cache->ptr)) { 3667 invalidate_and_set_dirty(cache->mrs.mr, addr + cache->xlat, access_len); 3668 } 3669} 3670 3671void address_space_cache_destroy(MemoryRegionCache *cache) 3672{ 3673 if (!cache->mrs.mr) { 3674 return; 3675 } 3676 3677 if (xen_enabled()) { 3678 xen_invalidate_map_cache_entry(cache->ptr); 3679 } 3680 memory_region_unref(cache->mrs.mr); 3681 flatview_unref(cache->fv); 3682 cache->mrs.mr = NULL; 3683 cache->fv = NULL; 3684} 3685 3686/* Called from RCU critical section. This function has the same 3687 * semantics as address_space_translate, but it only works on a 3688 * predefined range of a MemoryRegion that was mapped with 3689 * address_space_cache_init. 3690 */ 3691static inline MemoryRegion *address_space_translate_cached( 3692 MemoryRegionCache *cache, hwaddr addr, hwaddr *xlat, 3693 hwaddr *plen, bool is_write, MemTxAttrs attrs) 3694{ 3695 MemoryRegionSection section; 3696 MemoryRegion *mr; 3697 IOMMUMemoryRegion *iommu_mr; 3698 AddressSpace *target_as; 3699 3700 assert(!cache->ptr); 3701 *xlat = addr + cache->xlat; 3702 3703 mr = cache->mrs.mr; 3704 iommu_mr = memory_region_get_iommu(mr); 3705 if (!iommu_mr) { 3706 /* MMIO region. */ 3707 return mr; 3708 } 3709 3710 section = address_space_translate_iommu(iommu_mr, xlat, plen, 3711 NULL, is_write, true, 3712 &target_as, attrs); 3713 return section.mr; 3714} 3715 3716/* Called from RCU critical section. address_space_read_cached uses this 3717 * out of line function when the target is an MMIO or IOMMU region. 3718 */ 3719void 3720address_space_read_cached_slow(MemoryRegionCache *cache, hwaddr addr, 3721 void *buf, hwaddr len) 3722{ 3723 hwaddr addr1, l; 3724 MemoryRegion *mr; 3725 3726 l = len; 3727 mr = address_space_translate_cached(cache, addr, &addr1, &l, false, 3728 MEMTXATTRS_UNSPECIFIED); 3729 flatview_read_continue(cache->fv, 3730 addr, MEMTXATTRS_UNSPECIFIED, buf, len, 3731 addr1, l, mr); 3732} 3733 3734/* Called from RCU critical section. address_space_write_cached uses this 3735 * out of line function when the target is an MMIO or IOMMU region. 
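 *
 * The cached API as a whole follows an init/access/destroy pattern
 * (editorial sketch; base, size and buf are hypothetical):
 *
 *     MemoryRegionCache cache;
 *     int64_t n = address_space_cache_init(&cache, as, base, size, false);
 *     if (n >= (int64_t)size) {
 *         address_space_read_cached(&cache, 0, buf, size);
 *     }
 *     address_space_cache_destroy(&cache);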
3736 */ 3737void 3738address_space_write_cached_slow(MemoryRegionCache *cache, hwaddr addr, 3739 const void *buf, hwaddr len) 3740{ 3741 hwaddr addr1, l; 3742 MemoryRegion *mr; 3743 3744 l = len; 3745 mr = address_space_translate_cached(cache, addr, &addr1, &l, true, 3746 MEMTXATTRS_UNSPECIFIED); 3747 flatview_write_continue(cache->fv, 3748 addr, MEMTXATTRS_UNSPECIFIED, buf, len, 3749 addr1, l, mr); 3750} 3751 3752#define ARG1_DECL MemoryRegionCache *cache 3753#define ARG1 cache 3754#define SUFFIX _cached_slow 3755#define TRANSLATE(...) address_space_translate_cached(cache, __VA_ARGS__) 3756#define RCU_READ_LOCK() ((void)0) 3757#define RCU_READ_UNLOCK() ((void)0) 3758#include "memory_ldst.inc.c" 3759 3760/* virtual memory access for debug (includes writing to ROM) */ 3761int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr, 3762 void *ptr, target_ulong len, bool is_write) 3763{ 3764 hwaddr phys_addr; 3765 target_ulong l, page; 3766 uint8_t *buf = ptr; 3767 3768 cpu_synchronize_state(cpu); 3769 while (len > 0) { 3770 int asidx; 3771 MemTxAttrs attrs; 3772 3773 page = addr & TARGET_PAGE_MASK; 3774 phys_addr = cpu_get_phys_page_attrs_debug(cpu, page, &attrs); 3775 asidx = cpu_asidx_from_attrs(cpu, attrs); 3776 /* if no physical page mapped, return an error */ 3777 if (phys_addr == -1) 3778 return -1; 3779 l = (page + TARGET_PAGE_SIZE) - addr; 3780 if (l > len) 3781 l = len; 3782 phys_addr += (addr & ~TARGET_PAGE_MASK); 3783 if (is_write) { 3784 address_space_write_rom(cpu->cpu_ases[asidx].as, phys_addr, 3785 attrs, buf, l); 3786 } else { 3787 address_space_read(cpu->cpu_ases[asidx].as, phys_addr, attrs, buf, 3788 l); 3789 } 3790 len -= l; 3791 buf += l; 3792 addr += l; 3793 } 3794 return 0; 3795} 3796 3797/* 3798 * Allows code that needs to deal with migration bitmaps etc to still be built 3799 * target independent. 3800 */ 3801size_t qemu_target_page_size(void) 3802{ 3803 return TARGET_PAGE_SIZE; 3804} 3805 3806int qemu_target_page_bits(void) 3807{ 3808 return TARGET_PAGE_BITS; 3809} 3810 3811int qemu_target_page_bits_min(void) 3812{ 3813 return TARGET_PAGE_BITS_MIN; 3814} 3815#endif 3816 3817bool target_words_bigendian(void) 3818{ 3819#if defined(TARGET_WORDS_BIGENDIAN) 3820 return true; 3821#else 3822 return false; 3823#endif 3824} 3825 3826#ifndef CONFIG_USER_ONLY 3827bool cpu_physical_memory_is_io(hwaddr phys_addr) 3828{ 3829 MemoryRegion*mr; 3830 hwaddr l = 1; 3831 bool res; 3832 3833 RCU_READ_LOCK_GUARD(); 3834 mr = address_space_translate(&address_space_memory, 3835 phys_addr, &phys_addr, &l, false, 3836 MEMTXATTRS_UNSPECIFIED); 3837 3838 res = !(memory_region_is_ram(mr) || memory_region_is_romd(mr)); 3839 return res; 3840} 3841 3842int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque) 3843{ 3844 RAMBlock *block; 3845 int ret = 0; 3846 3847 RCU_READ_LOCK_GUARD(); 3848 RAMBLOCK_FOREACH(block) { 3849 ret = func(block, opaque); 3850 if (ret) { 3851 break; 3852 } 3853 } 3854 return ret; 3855} 3856 3857/* 3858 * Unmap pages of memory from start to start+length such that 3859 * they a) read as 0, b) Trigger whatever fault mechanism 3860 * the OS provides for postcopy. 3861 * The pages must be unmapped by the end of the function. 
3862  * Returns: 0 on success, non-0 on failure
3863  *
3864  */
3865 int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
3866 {
3867     int ret = -1;
3868
3869     uint8_t *host_startaddr = rb->host + start;
3870
3871     if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) {
3872         error_report("ram_block_discard_range: Unaligned start address: %p",
3873                      host_startaddr);
3874         goto err;
3875     }
3876
3877     if ((start + length) <= rb->used_length) {
3878         bool need_madvise, need_fallocate;
3879         if (!QEMU_IS_ALIGNED(length, rb->page_size)) {
3880             error_report("ram_block_discard_range: Unaligned length: %zx",
3881                          length);
3882             goto err;
3883         }
3884
3885         errno = ENOTSUP; /* If we are missing MADVISE etc */
3886
3887         /* The logic here is messy;
3888          *    madvise DONTNEED fails for hugepages
3889          *    fallocate works on hugepages and shmem
3890          */
3891         need_madvise = (rb->page_size == qemu_host_page_size);
3892         need_fallocate = rb->fd != -1;
3893         if (need_fallocate) {
3894             /* For a file, this causes the area of the file to be zero'd
3895              * if read, and for hugetlbfs also causes it to be unmapped
3896              * so a userfault will trigger.
3897              */
3898 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
3899             ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
3900                             start, length);
3901             if (ret) {
3902                 ret = -errno;
3903                 error_report("ram_block_discard_range: Failed to fallocate "
3904                              "%s:%" PRIx64 " +%zx (%d)",
3905                              rb->idstr, start, length, ret);
3906                 goto err;
3907             }
3908 #else
3909             ret = -ENOSYS;
3910             error_report("ram_block_discard_range: fallocate not available/file "
3911                          "%s:%" PRIx64 " +%zx (%d)",
3912                          rb->idstr, start, length, ret);
3913             goto err;
3914 #endif
3915         }
3916         if (need_madvise) {
3917             /* For normal RAM this causes it to be unmapped,
3918              * for shared memory it causes the local mapping to disappear
3919              * and to fall back on the file contents (which we just
3920              * fallocate'd away).
3921 */ 3922#if defined(CONFIG_MADVISE) 3923 ret = madvise(host_startaddr, length, MADV_DONTNEED); 3924 if (ret) { 3925 ret = -errno; 3926 error_report("ram_block_discard_range: Failed to discard range " 3927 "%s:%" PRIx64 " +%zx (%d)", 3928 rb->idstr, start, length, ret); 3929 goto err; 3930 } 3931#else 3932 ret = -ENOSYS; 3933 error_report("ram_block_discard_range: MADVISE not available" 3934 "%s:%" PRIx64 " +%zx (%d)", 3935 rb->idstr, start, length, ret); 3936 goto err; 3937#endif 3938 } 3939 trace_ram_block_discard_range(rb->idstr, host_startaddr, length, 3940 need_madvise, need_fallocate, ret); 3941 } else { 3942 error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64 3943 "/%zx/" RAM_ADDR_FMT")", 3944 rb->idstr, start, length, rb->used_length); 3945 } 3946 3947err: 3948 return ret; 3949} 3950 3951bool ramblock_is_pmem(RAMBlock *rb) 3952{ 3953 return rb->flags & RAM_PMEM; 3954} 3955 3956#endif 3957 3958void page_size_init(void) 3959{ 3960 /* NOTE: we can always suppose that qemu_host_page_size >= 3961 TARGET_PAGE_SIZE */ 3962 if (qemu_host_page_size == 0) { 3963 qemu_host_page_size = qemu_real_host_page_size; 3964 } 3965 if (qemu_host_page_size < TARGET_PAGE_SIZE) { 3966 qemu_host_page_size = TARGET_PAGE_SIZE; 3967 } 3968 qemu_host_page_mask = -(intptr_t)qemu_host_page_size; 3969} 3970 3971#if !defined(CONFIG_USER_ONLY) 3972 3973static void mtree_print_phys_entries(int start, int end, int skip, int ptr) 3974{ 3975 if (start == end - 1) { 3976 qemu_printf("\t%3d ", start); 3977 } else { 3978 qemu_printf("\t%3d..%-3d ", start, end - 1); 3979 } 3980 qemu_printf(" skip=%d ", skip); 3981 if (ptr == PHYS_MAP_NODE_NIL) { 3982 qemu_printf(" ptr=NIL"); 3983 } else if (!skip) { 3984 qemu_printf(" ptr=#%d", ptr); 3985 } else { 3986 qemu_printf(" ptr=[%d]", ptr); 3987 } 3988 qemu_printf("\n"); 3989} 3990 3991#define MR_SIZE(size) (int128_nz(size) ? (hwaddr)int128_get64( \ 3992 int128_sub((size), int128_one())) : 0) 3993 3994void mtree_print_dispatch(AddressSpaceDispatch *d, MemoryRegion *root) 3995{ 3996 int i; 3997 3998 qemu_printf(" Dispatch\n"); 3999 qemu_printf(" Physical sections\n"); 4000 4001 for (i = 0; i < d->map.sections_nb; ++i) { 4002 MemoryRegionSection *s = d->map.sections + i; 4003 const char *names[] = { " [unassigned]", " [not dirty]", 4004 " [ROM]", " [watch]" }; 4005 4006 qemu_printf(" #%d @" TARGET_FMT_plx ".." TARGET_FMT_plx 4007 " %s%s%s%s%s", 4008 i, 4009 s->offset_within_address_space, 4010 s->offset_within_address_space + MR_SIZE(s->mr->size), 4011 s->mr->name ? s->mr->name : "(noname)", 4012 i < ARRAY_SIZE(names) ? names[i] : "", 4013 s->mr == root ? " [ROOT]" : "", 4014 s == d->mru_section ? " [MRU]" : "", 4015 s->mr->is_iommu ? " [iommu]" : ""); 4016 4017 if (s->mr->alias) { 4018 qemu_printf(" alias=%s", s->mr->alias->name ? 
4019 s->mr->alias->name : "noname"); 4020 } 4021 qemu_printf("\n"); 4022 } 4023 4024 qemu_printf(" Nodes (%d bits per level, %d levels) ptr=[%d] skip=%d\n", 4025 P_L2_BITS, P_L2_LEVELS, d->phys_map.ptr, d->phys_map.skip); 4026 for (i = 0; i < d->map.nodes_nb; ++i) { 4027 int j, jprev; 4028 PhysPageEntry prev; 4029 Node *n = d->map.nodes + i; 4030 4031 qemu_printf(" [%d]\n", i); 4032 4033 for (j = 0, jprev = 0, prev = *n[0]; j < ARRAY_SIZE(*n); ++j) { 4034 PhysPageEntry *pe = *n + j; 4035 4036 if (pe->ptr == prev.ptr && pe->skip == prev.skip) { 4037 continue; 4038 } 4039 4040 mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr); 4041 4042 jprev = j; 4043 prev = *pe; 4044 } 4045 4046 if (jprev != ARRAY_SIZE(*n)) { 4047 mtree_print_phys_entries(jprev, j, prev.skip, prev.ptr); 4048 } 4049 } 4050} 4051 4052#endif